-
Notifications
You must be signed in to change notification settings - Fork 0
/
ml_rf_RT.R
115 lines (96 loc) · 4.22 KB
/
ml_rf_RT.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
library(tidyverse)
library(caret)
library(ranger) # fast implementation of random forest
source("file_loading.R")
source("preprocessing_fxns.R")
source("rank_transform_fxns.R")
set.seed(100)
num.studies <- length(studies.df) # Number of datasets
studies.df <- lapply(studies.df, geneSort)
commonGeneNames <- getGeneSymbols(studies.df[[1]])
for (i in 2:num.studies) {
commonGeneNames <- intersect(commonGeneNames, getGeneSymbols(studies.df[[i]]))
}
numCommonGenes <- length(commonGeneNames) # Number of common genes
ranked.studies.df <- lapply(studies.df, matchCommonGenes) %>%
lapply(rankTransform)
# The labels for patients with a label
labels <- lapply(ranked.studies.df, function(x){ x$sampInfo$cluster.MT[!is.na(x$sampInfo$cluster.MT)] })
studies <- lapply(ranked.studies.df, extractData,
common.gene.names = commonGeneNames) %>%
reorderGeneBySignificance(common.gene.names = commonGeneNames)
# This variable indicates how many genes of top differential expression are kept
df.length <- ceiling(length(commonGeneNames)/5)
# Append the classification to the training data
for (i in 1:num.studies) {
studies[[i]] <- add_column(studies[[i]][ , 1:df.length], class = labels[[i]])
}
# We create an empty tibble first to hold the leanring results
learning.results <- tibble(learning.set = rep('NA', length(studies)^2),
validation.set = rep('NA', length(studies)^2),
accuracy = rep(0, length(studies)^2),
sensitivity = rep(0, length(studies)^2),
specificity = rep(0, length(studies)^2))
len <- length(studies)
final_models <- list()
for (i in 1:len) {
studies.min.1 <- studies[-i]
studies.names.min.1 <- studies.names[-i]
validation.set <- studies[[i]][, 1:df.length]
truth <- studies[[i]]$class # The truth vector
## Run the RF (with 1500 trees) and SVM on one dataset, and validate on studies[i]
for (j in 1:length(studies.min.1)) {
learning.set <- studies.min.1[[j]]
# Random Forest
rf <- ranger::ranger(class ~ ., data = learning.set, num.trees = 1500, probability = TRUE)
# We store the models into the list
final_models[[5 * (i - 1) + j]] <- rf
pred <- predict(rf, validation.set)
pred_column <- factor(ifelse(pred$predictions[,1] > 0.5, "basal", "classical"),
levels = c("basal", "classical"))
confusion.mat <- confusionMatrix(data = pred_column,
reference = truth)
accu <- confusion.mat$overall[["Accuracy"]]
sen <- confusion.mat$byClass[["Sensitivity"]]
spe <- confusion.mat$byClass[["Specificity"]]
learning.results[5 * (i - 1) + j,] <-
c(
studies.names.min.1[j],
studies.names[i],
signif(accu, digits = 4),
signif(sen, digits = 4),
signif(spe, digits = 4)
)
}
## Run the rf on combined datasets
learning.set <- studies.min.1[[1]]
for (k in 1:(length(studies.min.1) - 1)) {
learning.set <- rbind(learning.set, studies.min.1[[1 + k]])
}
rf <- ranger::ranger(class ~ ., data = learning.set, num.trees = 1500, probability = TRUE)
final_models[[5 * i]] <- rf
pred <- predict(rf, validation.set)
pred_column <- factor(ifelse(pred$predictions[,1] > 0.5, "basal", "classical"),
levels = c("basal", "classical"))
confusion.mat <- confusionMatrix(data = pred_column,
reference = truth)
accu <- confusion.mat$overall[["Accuracy"]]
sen <- confusion.mat$byClass[["Sensitivity"]]
spe <- confusion.mat$byClass[["Specificity"]]
learning.results[5 * i,] <-
c(
paste("comb. minus", studies.names[i],
sep = " "),
studies.names[i],
signif(accu, digits = 4),
signif(sen, digits = 4),
signif(spe, digits = 4)
)
}
# We compile a list of the outputs that contains the models as well as the result table
final_results <- list("models" = final_models,
"results" = learning.results)
print(learning.results, n = length(studies)^2)
write.table(learning.results, file = "result_tables/learning_results_RF_RT.csv")
save(x = final_results, file = "models_and_predictions/learning_result_RF_RT.Rdata")
save.image()