Skip to content

Commit fb75d95

Browse files
New vignette (for Boston data)
1 parent 17c35a0 commit fb75d95

File tree

8 files changed

+373
-903
lines changed

8 files changed

+373
-903
lines changed

inst/doc/randomForestExplainer.R

Lines changed: 31 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1,146 +1,80 @@
11
## ----setup, include=FALSE------------------------------------------------
2-
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
2+
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.width = 7, fig.height = 5)
33

44
## ------------------------------------------------------------------------
55
library(randomForest)
6+
# devtools::install_github("MI2DataLab/randomForestExplainer")
67
library(randomForestExplainer)
78

89
## ------------------------------------------------------------------------
9-
load("GlioblastomaWide.rda")
10-
GlioblastomaWide <- GlioblastomaWide[, -1]
11-
GlioblastomaWide$death1y <- as.factor(GlioblastomaWide$death1y)
12-
remove <- colSums(is.na(GlioblastomaWide))
13-
GlioblastomaWide <- GlioblastomaWide[, remove == 0]
14-
rm(remove)
10+
data(Boston, package = "MASS")
11+
Boston$chas <- as.logical(Boston$chas)
12+
str(Boston)
1513

1614
## ------------------------------------------------------------------------
17-
# set.seed(2017)
18-
# forest <- randomForest(death1y ~ ., data = GlioblastomaWide, ntree = 10000, localImp = TRUE)
19-
# save(forest, file = "GlioblastomaWide_forest.rda")
20-
load("GlioblastomaWide_forest.rda")
21-
22-
## ------------------------------------------------------------------------
23-
plot(forest, main = "Learning curve of the forest")
24-
legend("topright", c("error for 'dead'", "misclassification error", "error for 'alive'"), lty = c(1,1,1), col = c("green", "black", "red"))
15+
set.seed(2017)
16+
forest <- randomForest(medv ~ ., data = Boston, localImp = TRUE)
2517

2618
## ------------------------------------------------------------------------
2719
forest
2820

2921
## ------------------------------------------------------------------------
30-
min_depth_frame <- min_depth_distribution(forest)
22+
# min_depth_frame <- min_depth_distribution(forest)
23+
# save(min_depth_frame, file = "min_depth_frame.rda")
24+
load("min_depth_frame.rda")
3125
head(min_depth_frame, n = 10)
3226

3327
## ------------------------------------------------------------------------
28+
# plot_min_depth_distribution(forest) # gives the same result as below but takes longer
3429
plot_min_depth_distribution(min_depth_frame)
3530

36-
## ------------------------------------------------------------------------
37-
plot_min_depth_distribution(min_depth_frame, min_no_of_trees = 60, mean_sample = "relevant_trees")
31+
## ----fig.height = 7------------------------------------------------------
32+
plot_min_depth_distribution(min_depth_frame, mean_sample = "relevant_trees", k = 15)
3833

3934
## ------------------------------------------------------------------------
4035
# importance_frame <- measure_importance(forest)
41-
# save(importance_frame, file = "GlioblastomaWide_importance_frame.rda")
42-
load("GlioblastomaWide_importance_frame.rda")
43-
head(importance_frame, n = 10)
36+
# save(importance_frame, file = "importance_frame.rda")
37+
load("importance_frame.rda")
38+
importance_frame
4439

4540
## ------------------------------------------------------------------------
46-
plot_multi_way_importance(importance_frame, size_measure = "no_of_nodes", min_no_of_trees = 30)
41+
# plot_multi_way_importance(forest, size_measure = "no_of_nodes") # gives the same result as below but takes longer
42+
plot_multi_way_importance(importance_frame, size_measure = "no_of_nodes")
4743

4844
## ------------------------------------------------------------------------
49-
plot_multi_way_importance(importance_frame, x_measure = "accuracy_decrease", y_measure = "gini_decrease", size_measure = "p_value")
45+
plot_multi_way_importance(importance_frame, x_measure = "mse_increase", y_measure = "node_purity_increase", size_measure = "p_value", no_of_labels = 5)
5046

5147
## ------------------------------------------------------------------------
48+
# plot_importance_ggpairs(forest) # gives the same result as below but takes longer
5249
plot_importance_ggpairs(importance_frame)
5350

5451
## ------------------------------------------------------------------------
52+
# plot_importance_rankings(forest) # gives the same result as below but takes longer
5553
plot_importance_rankings(importance_frame)
5654

5755
## ------------------------------------------------------------------------
58-
(vars <- important_variables(importance_frame, k = 20, measures = c("mean_min_depth", "no_of_trees")))
56+
# (vars <- important_variables(forest, k = 5, measures = c("mean_min_depth", "no_of_trees"))) # gives the same result as below but takes longer
57+
(vars <- important_variables(importance_frame, k = 5, measures = c("mean_min_depth", "no_of_trees")))
5958

6059
## ------------------------------------------------------------------------
6160
# interactions_frame <- min_depth_interactions(forest, vars)
62-
# save(interactions_frame, file = "GlioblastomaWide_interactions_frame.rda")
63-
load("GlioblastomaWide_interactions_frame.rda")
64-
head(interactions_frame[order(interactions_frame$occurrences, decreasing = TRUE), ])
65-
66-
## ------------------------------------------------------------------------
67-
# set.seed(2017)
68-
# forest_v2 <- randomForest(death1y ~ ., data = GlioblastomaWide, ntree = 10000, mtry = floor(ncol(GlioblastomaWide)/3), importance = TRUE, localImp = TRUE)
69-
# save(forest_v2, file = "GlioblastomaWide_forest_v2.rda")
70-
load("GlioblastomaWide_forest_v2.rda"); rm(forest)
71-
importance_frame <- measure_importance(forest_v2)
72-
vars <- important_variables(importance_frame, k = 20, measures = c("mean_min_depth", "no_of_trees"))
73-
# interactions_frame <- min_depth_interactions(forest_v2, vars)
74-
# save(interactions_frame, file = "GlioblastomaWide_interactions_frame_v2.rda")
75-
load("GlioblastomaWide_interactions_frame_v2.rda")
61+
# save(interactions_frame, file = "interactions_frame.rda")
62+
load("interactions_frame.rda")
7663
head(interactions_frame[order(interactions_frame$occurrences, decreasing = TRUE), ])
7764

7865
## ------------------------------------------------------------------------
66+
# plot_min_depth_interactions(forest) # calculates the interactions_frame for default settings so may give different results than the function below depending on our settings and takes more time
7967
plot_min_depth_interactions(interactions_frame)
8068

8169
## ------------------------------------------------------------------------
82-
interactions_frame <- min_depth_interactions(forest_v2, vars, mean_sample = "relevant_trees", uncond_mean_sample = "relevant_trees")
70+
# interactions_frame <- min_depth_interactions(forest, vars, mean_sample = "relevant_trees", uncond_mean_sample = "relevant_trees")
71+
# save(interactions_frame, file = "interactions_frame_relevant.rda")
72+
load("interactions_frame_relevant.rda")
8373
plot_min_depth_interactions(interactions_frame)
8474

8575
## ------------------------------------------------------------------------
86-
plot_predict_interaction(forest_v2, GlioblastomaWide, "SLC17A9", "IFIT2", grid = 80)
87-
88-
## ---- eval = FALSE-------------------------------------------------------
89-
# explain_forest(forest, interactions = TRUE, GlioblastomaWide, pred_grid = 70)
90-
91-
## ---- eval = FALSE-------------------------------------------------------
92-
# explain_forest(forest_v2, interactions = TRUE, GlioblastomaWide, pred_grid = 80)
93-
94-
## ---- eval = FALSE-------------------------------------------------------
95-
# # source("https://bioconductor.org/biocLite.R")
96-
# # biocLite("DESeq")
97-
# # biocLite("limma")
98-
# # biocLite("TxDb.Hsapiens.UCSC.hg18.knownGene")
99-
# # biocLite("org.Hs.eg.db")
100-
# # biocLite("DESeq2")
101-
# # biocLite("edgeR")
102-
# # devtools::install_github("geneticsMiNIng/MLGenSig", subdir = "MetExpR")
103-
#
104-
# # brca <- MetExpR::BRCA_mRNAseq_chr17
105-
# # colnames(brca) <- make.names(colnames(brca))
106-
# # brca$SUBTYPE <- factor(brca$SUBTYPE)
107-
#
108-
# # save(brca, file = "BreastCancer.rda")
109-
# load("BreastCancer.rda")
110-
111-
## ---- eval = FALSE-------------------------------------------------------
112-
# # set.seed(2017)
113-
# # forest_brca <- randomForest(SUBTYPE ~ ., data = brca, ntree = 10000, localImp = TRUE)
114-
# # save(forest_brca, file = "BreastCancer_forest.rda")
115-
# load("BreastCancer_forest.rda")
116-
# explain_forest(forest_brca, interactions = TRUE, brca)
117-
118-
## ---- eval = FALSE-------------------------------------------------------
119-
# # devtools::install_github("pbiecek/PISA2012lite")
120-
# # library("PISA2012lite")
121-
#
122-
# # pisa <- na.omit(student2012[,c(1, 4, 12, 13, 18:20, 39, 61:62, 114, 488, 457, 501)])
123-
# # pisa <- pisa[pisa$CNT %in% c("Austria", "Belgium", "Czech Republic", "Germany", "Denmark", "Spain", "Estonia", "Finland", "France", "United Kingdom", "Greece", "Hungary", "Ireland", "Italy", "Lithuania", "Luxembourg", "Latvia", "Netherlands", "Poland", "Portugal", "Slovak Republic", "Slovenia", "Sweden", "Romania", "Croatia", "Bulgaria"),] # consider only EU countries to reduce the size
124-
# # save(pisa, file = "PISA.rda")
125-
# load("PISA.rda")
126-
#
127-
# # Further reduce the size
128-
# pisa <- pisa[pisa$CNT %in% c("Czech Republic", "Hungary", "Poland", "Slovak Republic"), ] # only the Visegrad group
129-
# pisa$CNT <- factor(pisa$CNT)
130-
131-
## ---- eval = FALSE-------------------------------------------------------
132-
# # set.seed(2017)
133-
# # forest_pisa <- randomForest(PV1MATH ~ ., data = pisa, localImp = TRUE)
134-
# # save(forest_pisa, file = "PISA_forest.rda")
135-
# load("PISA_forest.rda")
136-
# explain_forest(forest_pisa, interactions = TRUE, pisa)
76+
plot_predict_interaction(forest, Boston, "rm", "lstat")
13777

13878
## ---- eval = FALSE-------------------------------------------------------
139-
# data(Boston, package = "MASS")
140-
# Boston$chas <- as.logical(Boston$chas)
141-
# # set.seed(2017)
142-
# # forest_Boston <- randomForest(medv ~ ., data = Boston, ntree = 1000, localImp = TRUE)
143-
# # save(forest_Boston, file = "Boston_forest.rda")
144-
# load("Boston_forest.rda")
145-
# explain_forest(forest_Boston, interactions = TRUE, Boston)
79+
# explain_forest(forest, interactions = TRUE, data = Boston)
14680

0 commit comments

Comments
 (0)