|
1 | 1 | ## ----setup, include=FALSE------------------------------------------------ |
2 | | -knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) |
| 2 | +knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.width = 7, fig.height = 5) |
3 | 3 |
|
4 | 4 | ## ------------------------------------------------------------------------ |
5 | 5 | library(randomForest) |
| 6 | +# devtools::install_github("MI2DataLab/randomForestExplainer") |
6 | 7 | library(randomForestExplainer) |
7 | 8 |
|
8 | 9 | ## ------------------------------------------------------------------------ |
9 | | -load("GlioblastomaWide.rda") |
10 | | -GlioblastomaWide <- GlioblastomaWide[, -1] |
11 | | -GlioblastomaWide$death1y <- as.factor(GlioblastomaWide$death1y) |
12 | | -remove <- colSums(is.na(GlioblastomaWide)) |
13 | | -GlioblastomaWide <- GlioblastomaWide[, remove == 0] |
14 | | -rm(remove) |
| 10 | +data(Boston, package = "MASS") |
| 11 | +Boston$chas <- as.logical(Boston$chas) |
| 12 | +str(Boston) |
15 | 13 |
|
16 | 14 | ## ------------------------------------------------------------------------ |
17 | | -# set.seed(2017) |
18 | | -# forest <- randomForest(death1y ~ ., data = GlioblastomaWide, ntree = 10000, localImp = TRUE) |
19 | | -# save(forest, file = "GlioblastomaWide_forest.rda") |
20 | | -load("GlioblastomaWide_forest.rda") |
21 | | - |
22 | | -## ------------------------------------------------------------------------ |
23 | | -plot(forest, main = "Learning curve of the forest") |
24 | | -legend("topright", c("error for 'dead'", "misclassification error", "error for 'alive'"), lty = c(1,1,1), col = c("green", "black", "red")) |
| 15 | +set.seed(2017) |
| 16 | +forest <- randomForest(medv ~ ., data = Boston, localImp = TRUE) |
25 | 17 |
|
26 | 18 | ## ------------------------------------------------------------------------ |
27 | 19 | forest |
28 | 20 |
|
29 | 21 | ## ------------------------------------------------------------------------ |
30 | | -min_depth_frame <- min_depth_distribution(forest) |
| 22 | +# min_depth_frame <- min_depth_distribution(forest) |
| 23 | +# save(min_depth_frame, file = "min_depth_frame.rda") |
| 24 | +load("min_depth_frame.rda") |
31 | 25 | head(min_depth_frame, n = 10) |
32 | 26 |
|
33 | 27 | ## ------------------------------------------------------------------------ |
| 28 | +# plot_min_depth_distribution(forest) # gives the same result as below but takes longer |
34 | 29 | plot_min_depth_distribution(min_depth_frame) |
35 | 30 |
|
36 | | -## ------------------------------------------------------------------------ |
37 | | -plot_min_depth_distribution(min_depth_frame, min_no_of_trees = 60, mean_sample = "relevant_trees") |
| 31 | +## ----fig.height = 7------------------------------------------------------ |
| 32 | +plot_min_depth_distribution(min_depth_frame, mean_sample = "relevant_trees", k = 15) |
38 | 33 |
|
39 | 34 | ## ------------------------------------------------------------------------ |
40 | 35 | # importance_frame <- measure_importance(forest) |
41 | | -# save(importance_frame, file = "GlioblastomaWide_importance_frame.rda") |
42 | | -load("GlioblastomaWide_importance_frame.rda") |
43 | | -head(importance_frame, n = 10) |
| 36 | +# save(importance_frame, file = "importance_frame.rda") |
| 37 | +load("importance_frame.rda") |
| 38 | +importance_frame |
44 | 39 |
|
45 | 40 | ## ------------------------------------------------------------------------ |
46 | | -plot_multi_way_importance(importance_frame, size_measure = "no_of_nodes", min_no_of_trees = 30) |
| 41 | +# plot_multi_way_importance(forest, size_measure = "no_of_nodes") # gives the same result as below but takes longer |
| 42 | +plot_multi_way_importance(importance_frame, size_measure = "no_of_nodes") |
47 | 43 |
|
48 | 44 | ## ------------------------------------------------------------------------ |
49 | | -plot_multi_way_importance(importance_frame, x_measure = "accuracy_decrease", y_measure = "gini_decrease", size_measure = "p_value") |
| 45 | +plot_multi_way_importance(importance_frame, x_measure = "mse_increase", y_measure = "node_purity_increase", size_measure = "p_value", no_of_labels = 5) |
50 | 46 |
|
51 | 47 | ## ------------------------------------------------------------------------ |
| 48 | +# plot_importance_ggpairs(forest) # gives the same result as below but takes longer |
52 | 49 | plot_importance_ggpairs(importance_frame) |
53 | 50 |
|
54 | 51 | ## ------------------------------------------------------------------------ |
| 52 | +# plot_importance_rankings(forest) # gives the same result as below but takes longer |
55 | 53 | plot_importance_rankings(importance_frame) |
56 | 54 |
|
57 | 55 | ## ------------------------------------------------------------------------ |
58 | | -(vars <- important_variables(importance_frame, k = 20, measures = c("mean_min_depth", "no_of_trees"))) |
| 56 | +# (vars <- important_variables(forest, k = 5, measures = c("mean_min_depth", "no_of_trees"))) # gives the same result as below but takes longer |
| 57 | +(vars <- important_variables(importance_frame, k = 5, measures = c("mean_min_depth", "no_of_trees"))) |
59 | 58 |
|
60 | 59 | ## ------------------------------------------------------------------------ |
61 | 60 | # interactions_frame <- min_depth_interactions(forest, vars) |
62 | | -# save(interactions_frame, file = "GlioblastomaWide_interactions_frame.rda") |
63 | | -load("GlioblastomaWide_interactions_frame.rda") |
64 | | -head(interactions_frame[order(interactions_frame$occurrences, decreasing = TRUE), ]) |
65 | | - |
66 | | -## ------------------------------------------------------------------------ |
67 | | -# set.seed(2017) |
68 | | -# forest_v2 <- randomForest(death1y ~ ., data = GlioblastomaWide, ntree = 10000, mtry = floor(ncol(GlioblastomaWide)/3), importance = TRUE, localImp = TRUE) |
69 | | -# save(forest_v2, file = "GlioblastomaWide_forest_v2.rda") |
70 | | -load("GlioblastomaWide_forest_v2.rda"); rm(forest) |
71 | | -importance_frame <- measure_importance(forest_v2) |
72 | | -vars <- important_variables(importance_frame, k = 20, measures = c("mean_min_depth", "no_of_trees")) |
73 | | -# interactions_frame <- min_depth_interactions(forest_v2, vars) |
74 | | -# save(interactions_frame, file = "GlioblastomaWide_interactions_frame_v2.rda") |
75 | | -load("GlioblastomaWide_interactions_frame_v2.rda") |
| 61 | +# save(interactions_frame, file = "interactions_frame.rda") |
| 62 | +load("interactions_frame.rda") |
76 | 63 | head(interactions_frame[order(interactions_frame$occurrences, decreasing = TRUE), ]) |
77 | 64 |
|
78 | 65 | ## ------------------------------------------------------------------------ |
| 66 | +# plot_min_depth_interactions(forest) # calculates the interactions_frame for default settings so may give different results than the function below depending on our settings and takes more time |
79 | 67 | plot_min_depth_interactions(interactions_frame) |
80 | 68 |
|
81 | 69 | ## ------------------------------------------------------------------------ |
82 | | -interactions_frame <- min_depth_interactions(forest_v2, vars, mean_sample = "relevant_trees", uncond_mean_sample = "relevant_trees") |
| 70 | +# interactions_frame <- min_depth_interactions(forest, vars, mean_sample = "relevant_trees", uncond_mean_sample = "relevant_trees") |
| 71 | +# save(interactions_frame, file = "interactions_frame_relevant.rda") |
| 72 | +load("interactions_frame_relevant.rda") |
83 | 73 | plot_min_depth_interactions(interactions_frame) |
84 | 74 |
|
85 | 75 | ## ------------------------------------------------------------------------ |
86 | | -plot_predict_interaction(forest_v2, GlioblastomaWide, "SLC17A9", "IFIT2", grid = 80) |
87 | | - |
88 | | -## ---- eval = FALSE------------------------------------------------------- |
89 | | -# explain_forest(forest, interactions = TRUE, GlioblastomaWide, pred_grid = 70) |
90 | | - |
91 | | -## ---- eval = FALSE------------------------------------------------------- |
92 | | -# explain_forest(forest_v2, interactions = TRUE, GlioblastomaWide, pred_grid = 80) |
93 | | - |
94 | | -## ---- eval = FALSE------------------------------------------------------- |
95 | | -# # source("https://bioconductor.org/biocLite.R") |
96 | | -# # biocLite("DESeq") |
97 | | -# # biocLite("limma") |
98 | | -# # biocLite("TxDb.Hsapiens.UCSC.hg18.knownGene") |
99 | | -# # biocLite("org.Hs.eg.db") |
100 | | -# # biocLite("DESeq2") |
101 | | -# # biocLite("edgeR") |
102 | | -# # devtools::install_github("geneticsMiNIng/MLGenSig", subdir = "MetExpR") |
103 | | -# |
104 | | -# # brca <- MetExpR::BRCA_mRNAseq_chr17 |
105 | | -# # colnames(brca) <- make.names(colnames(brca)) |
106 | | -# # brca$SUBTYPE <- factor(brca$SUBTYPE) |
107 | | -# |
108 | | -# # save(brca, file = "BreastCancer.rda") |
109 | | -# load("BreastCancer.rda") |
110 | | - |
111 | | -## ---- eval = FALSE------------------------------------------------------- |
112 | | -# # set.seed(2017) |
113 | | -# # forest_brca <- randomForest(SUBTYPE ~ ., data = brca, ntree = 10000, localImp = TRUE) |
114 | | -# # save(forest_brca, file = "BreastCancer_forest.rda") |
115 | | -# load("BreastCancer_forest.rda") |
116 | | -# explain_forest(forest_brca, interactions = TRUE, brca) |
117 | | - |
118 | | -## ---- eval = FALSE------------------------------------------------------- |
119 | | -# # devtools::install_github("pbiecek/PISA2012lite") |
120 | | -# # library("PISA2012lite") |
121 | | -# |
122 | | -# # pisa <- na.omit(student2012[,c(1, 4, 12, 13, 18:20, 39, 61:62, 114, 488, 457, 501)]) |
123 | | -# # pisa <- pisa[pisa$CNT %in% c("Austria", "Belgium", "Czech Republic", "Germany", "Denmark", "Spain", "Estonia", "Finland", "France", "United Kingdom", "Greece", "Hungary", "Ireland", "Italy", "Lithuania", "Luxembourg", "Latvia", "Netherlands", "Poland", "Portugal", "Slovak Republic", "Slovenia", "Sweden", "Romania", "Croatia", "Bulgaria"),] # consider only EU countries to reduce the size |
124 | | -# # save(pisa, file = "PISA.rda") |
125 | | -# load("PISA.rda") |
126 | | -# |
127 | | -# # Further reduce the size |
128 | | -# pisa <- pisa[pisa$CNT %in% c("Czech Republic", "Hungary", "Poland", "Slovak Republic"), ] # only the Visegrad group |
129 | | -# pisa$CNT <- factor(pisa$CNT) |
130 | | - |
131 | | -## ---- eval = FALSE------------------------------------------------------- |
132 | | -# # set.seed(2017) |
133 | | -# # forest_pisa <- randomForest(PV1MATH ~ ., data = pisa, localImp = TRUE) |
134 | | -# # save(forest_pisa, file = "PISA_forest.rda") |
135 | | -# load("PISA_forest.rda") |
136 | | -# explain_forest(forest_pisa, interactions = TRUE, pisa) |
| 76 | +plot_predict_interaction(forest, Boston, "rm", "lstat") |
137 | 77 |
|
138 | 78 | ## ---- eval = FALSE------------------------------------------------------- |
139 | | -# data(Boston, package = "MASS") |
140 | | -# Boston$chas <- as.logical(Boston$chas) |
141 | | -# # set.seed(2017) |
142 | | -# # forest_Boston <- randomForest(medv ~ ., data = Boston, ntree = 1000, localImp = TRUE) |
143 | | -# # save(forest_Boston, file = "Boston_forest.rda") |
144 | | -# load("Boston_forest.rda") |
145 | | -# explain_forest(forest_Boston, interactions = TRUE, Boston) |
| 79 | +# explain_forest(forest, interactions = TRUE, data = Boston) |
146 | 80 |
|
0 commit comments