cfsf-datasci_dot-hazmat/random_forest_v2.Rmd at master · bayeshack2016/cfsf-datasci_dot-hazmat · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
---
title: "Development of Machine Learning Models on Prediction of Hazmat Incidences"
date: "April 24, 2016"
output:
    md_document:
        variant: markdown_github
---

```{r, echo=F, comment=FALSE, message=FALSE}
library(dplyr)
library(caret)
library(stringr)
```

#### Load Data: Hazmat Incidents Merged with Oil, Gas, and Employment Statistics
```{r}
    # Read the merged incidents / employment / gas / oil table and give the
    # year column a lower-case name.
    all_data <- rename(
      read.csv("../Bayes_Hack/employment_gas_oil_data_merge.csv"),
      year = Year
    )

    # Rows for these areas are removed because they are missing data.
    territories <- c("Guam", "Puerto Rico", "Virgin Islands", "Dist. of Columbia")

    # Keep only rows outside the excluded areas and drop the 2016 rows.
    all_data_mod <- filter(all_data, !(State %in% territories), year != 2016)
```


#### Prototype Machine Learning Models with Caret:

Create test and training sets:
```{r}
# Fix the RNG seed so the random split below is repeatable.
set.seed(5678)

# Drop the year and State columns; all remaining columns are kept.
all_data_clean <- select(all_data_mod, -year, -State)

# Draw a label (1 or 2) for every row with probabilities 0.75 / 0.25.
# Rows labelled 1 form the training set, rows labelled 2 the test set, so the
# split is approximately (not exactly) 75 / 25.
ind <- sample(2, nrow(all_data_clean), replace = TRUE, prob = c(0.75, 0.25))
is_train <- ind == 1
training_set <- all_data_clean[is_train, ]
test_set <- all_data_clean[!is_train, ]

# Resampling scheme shared by all models: 5-fold cross-validation.
train_control <- trainControl(method = "cv", number = 5)
```

Fit Different Models to the training set:
```{r, warning=FALSE, message=FALSE}

# Every model below predicts number_of_hazardous_incidents from all other
# columns of training_set and is resampled with the shared train_control.
# The fits are kept in their original order because each one consumes random
# numbers when it builds its resampling folds.

# K-nearest neighbours on centred and scaled predictors, k = 1..6.
knn_fit <- train(
  number_of_hazardous_incidents ~ .,
  data = training_set,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = train_control,
  tuneGrid = expand.grid(k = as.numeric(1:6))
)

# K-nearest neighbours on scaled, PCA-transformed predictors, k = 1..10.
knn_fit_pca <- train(
  number_of_hazardous_incidents ~ .,
  data = training_set,
  method = "knn",
  preProcess = c("scale", "pca"),
  trControl = train_control,
  tuneGrid = expand.grid(k = as.numeric(1:10))
)

# Random KNN ("rknn") with mtry fixed at 1, k = 1..9.
random_knn <- train(
  number_of_hazardous_incidents ~ .,
  data = training_set,
  method = "rknn",
  preProcess = c("center", "scale"),
  trControl = train_control,
  tuneGrid = expand.grid(k = as.numeric(1:9), mtry = 1)
)

# Random forest ("rf"), tuning mtry over 6..10.
forest_fit <- train(
  number_of_hazardous_incidents ~ .,
  data = training_set,
  method = "rf",
  preProcess = c("center", "scale"),
  trControl = train_control,
  tuneGrid = expand.grid(mtry = as.numeric(6:10))
)

# Print the resampling summary of each fitted model.
knn_fit
knn_fit_pca
random_knn
forest_fit
```

Effect of tuning-parameter selection on RMSE for the different models:
```{r, comment=FALSE}
library(gridExtra)

# Build one tuning plot per fitted model, each with its own title.
knn_panel <- plot(knn_fit, main = "KNN")
knn_pca_panel <- plot(knn_fit_pca, main = "KNN w/ PCA")
rknn_panel <- plot(random_knn, main = "Random KNN")
forest_panel <- plot(forest_fit, main = "Random Forest")

# Draw the four panels together on a single page.
grid.arrange(knn_panel, knn_pca_panel, rknn_panel, forest_panel)
```

The Random Forest is the best performing model. Let's run the model on the test set:
```{r}
# Predict incident counts for the held-out test rows with the tuned forest.
forest_predict <- predict(forest_fit, newdata = test_set)

# caret's RMSE() is declared as RMSE(pred, obs). The value is the same either
# way round, but the documented order keeps the call unambiguous.
forest_RMSE <- RMSE(forest_predict, test_set$number_of_hazardous_incidents)

# Predicted vs. actual scatter plot with the y = x reference line.
# Built with ggplot() rather than qplot(), which is deprecated in ggplot2.
fit_df <- data.frame(
  predicted = forest_predict,
  actual = test_set$number_of_hazardous_incidents
)
ggplot(fit_df, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_abline() +
  labs(
    x = "Predicted Values",
    y = "Actual Number of Accidents",
    title = "Fit of Predicted vs Actual Values from Random Forest Model"
  ) +
  # Round only the printed label; forest_RMSE itself keeps full precision.
  annotate(
    "text",
    x = 1000, y = 400,
    label = paste0("RMSE: ", round(forest_RMSE, 2)),
    colour = "violet"
  )

```

Measure of Feature Importance:
```{r,comment=FALSE, message=FALSE}
library(randomForest)

# Refit a random forest directly with randomForest() on the training set so
# that variable importance can be reported.
# mtry is hard-coded to 10 here.
# NOTE(review): presumably this mirrors the mtry chosen by the caret tuning of
# forest_fit; confirm against forest_fit$bestTune.
# importance = TRUE is spelled out because the alias T can be reassigned.
random_forest <- randomForest(
  number_of_hazardous_incidents ~ .,
  data = training_set,
  mtry = 10,
  importance = TRUE
)

# Print the importance measures for every predictor.
importance(random_forest)

```
The number of employees, the number of acres leased for energy development, and the number of leases producing energy have the highest predictive value for hazmat incidents.


#### Data visualization of top three Features:
```{r, comment=FALSE, warning=FALSE, message=F}
# Keep three predictor columns together with the response column.
important_features <- select(
  all_data_clean,
  employees_mining_logging_1000,
  num_acres_leased,
  num_producing_leases,
  number_of_hazardous_incidents
)

# Fit a CART model ("rpart") on those columns, resampled with train_control.
rpartTune <- train(
  number_of_hazardous_incidents ~ .,
  data = important_features,
  method = "rpart",
  trControl = train_control
)

# Print the resampling summary of the fitted tree model.
rpartTune
```

Decision Tree Model:
```{r, message=FALSE}
library(rattle)
library(rpart.plot)

# Pull the final model out of the caret train object and draw it.
final_tree <- rpartTune[["finalModel"]]
fancyRpartPlot(final_tree)
```

This CART tree illustrates the effect that the most important features have on hazmat-related incidents. States that have more mining and logging employees tend to have more incidents, as do states with more acres and leases involved in oil and gas production.