Skip to content

Commit d2352f7

Browse files
committed
Calculation of SHAP values for H2O models
1 parent 65aed50 commit d2352f7

File tree

1 file changed

+44
-45
lines changed

1 file changed

+44
-45
lines changed

lib/calculate_shap.py

Lines changed: 44 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
1010
"""
1111

12+
1213
class calculate_shap():
1314
def __init__(self):
1415
super(calculate_shap, self).__init__()
@@ -64,16 +65,14 @@ def catboost_shap(self, model, df, y_variable=None):
6465

6566
# final_df = df.join(shap_values)
6667

67-
shap_columns= shap_values.columns
68+
shap_columns = shap_values.columns
6869

6970
Y = df.copy()
7071
for c in shap_columns:
7172
Y[c] = list(shap_values[c])
7273

73-
7474
return Y
7575

76-
7776
def kernel_shap(self, model, X_train):
7877
# use Kernel SHAP to explain test set predictions
7978
explainer = shap.KernelExplainer(model.predict_proba, X_train)
@@ -88,15 +87,13 @@ def kernel_shap(self, model, X_train):
8887
shap_columns.append(i + "_impact")
8988
pd_shap.columns = shap_columns
9089

91-
92-
9390
Y = X_train.copy()
9491
for c in shap_columns:
9592
Y[c] = list(pd_shap[c])
9693

9794
return Y, explainer
9895

99-
def kernel_shap_classification(self, model, X_train,prediction_col):
96+
def kernel_shap_classification(self, model, X_train, prediction_col):
10097
# use Kernel SHAP to explain test set predictions
10198
explainer = shap.KernelExplainer(model.predict_proba, X_train)
10299
shap_values = explainer.shap_values(X_train, nsamples=100)
@@ -110,41 +107,50 @@ def kernel_shap_classification(self, model, X_train,prediction_col):
110107
shap_columns.append(i + "_impact")
111108
pd_shap.columns = shap_columns
112109

113-
114-
115110
Y = X_train.copy()
116111
for c in shap_columns:
117112
Y[c] = list(pd_shap[c])
118113

119114
return Y, explainer
120115

121-
def select_row_shap_values(self, shap_values,prediction_col):
116+
def h2o_shap_results(self, model, h2o_df, X_train, prediction_col):
    """Compute per-feature SHAP contribution columns for an H2O model.

    Parameters
    ----------
    model : trained H2O model exposing ``predict_contributions``.
    h2o_df : H2OFrame passed to ``predict_contributions`` — TODO confirm
        it is row-aligned with ``X_train``.
    X_train : pandas DataFrame the ``<feature>_impact`` columns are
        appended to (a copy is returned; the input is not mutated).
    prediction_col : unused; kept only for signature parity with the
        other ``*_shap*`` helpers dispatched from ``find``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Copy of ``X_train`` with one ``<feature>_impact`` column per
        feature, and the raw contributions frame.
    """
    contributions = model.predict_contributions(h2o_df)
    pd_shap = contributions.as_data_frame()
    # The bias term is not a per-feature contribution; drop it if present
    # (errors='ignore' keeps this safe for models that do not emit it).
    pd_shap.drop(['BiasTerm'], axis=1, inplace=True, errors='ignore')
    shap_columns = [i + "_impact" for i in pd_shap.columns]
    pd_shap.columns = shap_columns
    result = X_train.copy()
    for c in shap_columns:
        result[c] = list(pd_shap[c])
    return result, pd_shap
129+
130+
def select_row_shap_values(self, shap_values, prediction_col):
    """Pick, per data point, the SHAP row of that point's predicted class.

    ``shap_values`` is the per-class structure returned by
    ``shap.TreeExplainer.shap_values`` for a classifier, indexed as
    ``shap_values[class][point]``; ``prediction_col`` holds the predicted
    class index for each point.

    Returns
    -------
    pandas.DataFrame
        One row per data point. If the number of classes happens to equal
        the number of points, the raw structure is wrapped unchanged
        (legacy behaviour preserved for callers relying on it).
    """
    num_of_classes = len(shap_values)

    # Degenerate/legacy case kept for backward compatibility.
    if num_of_classes == len(prediction_col):
        return pd.DataFrame(shap_values)

    # For each point, take the SHAP row of the class that was predicted.
    rows = [shap_values[cls][point] for point, cls in enumerate(prediction_col)]
    return pd.DataFrame(rows)
137146

138-
139-
def randomforest_shap_classification(self, model, X,prediction_col):
147+
def randomforest_shap_classification(self, model, X, prediction_col):
140148
explainer = shap.TreeExplainer(model)
141-
shap_values = explainer.shap_values(X,approximate=True)
142-
149+
shap_values = explainer.shap_values(X, approximate=True)
143150

144-
pd_shap = self.select_row_shap_values(shap_values,prediction_col)
151+
pd_shap = self.select_row_shap_values(shap_values, prediction_col)
145152
all_columns = list(X.columns)
146153

147-
148154
pd_shap.columns = [f"{y}_impact" for y in all_columns]
149155

150156
shap_columns = pd_shap.columns
@@ -153,19 +159,15 @@ def randomforest_shap_classification(self, model, X,prediction_col):
153159
for c in shap_columns:
154160
Y[c] = list(pd_shap[c])
155161

156-
157162
return Y, explainer
158163

159-
160164
def randomforest_shap(self, model, X):
161165
explainer = shap.TreeExplainer(model)
162-
shap_values = explainer.shap_values(X,approximate=True)
163-
166+
shap_values = explainer.shap_values(X, approximate=True)
164167

165168
pd_shap = pd.DataFrame(shap_values)
166169
all_columns = list(X.columns)
167170

168-
169171
pd_shap.columns = [f"{y}_impact" for y in all_columns]
170172

171173
shap_columns = pd_shap.columns
@@ -174,10 +176,8 @@ def randomforest_shap(self, model, X):
174176
for c in shap_columns:
175177
Y[c] = list(pd_shap[c])
176178

177-
178179
return Y, explainer
179180

180-
181181
def get_shap_values(self, x_array, model, x_variable, cat_index):
182182
"""
183183
SHAP VALUES CALCULATED
@@ -191,78 +191,77 @@ def get_shap_values(self, x_array, model, x_variable, cat_index):
191191
shap_values = pd.DataFrame(data=shap_values, columns=total_columns)
192192
return shap_values
193193

194-
def find(self, model, df, prediction_col, is_classification, model_name="xgboost"):
    """Route SHAP computation to the helper matching ``model_name``.

    Parameters
    ----------
    model : fitted model object, type depends on ``model_name``.
    df : feature data handed to the chosen SHAP helper (an H2OFrame for
        ``model_name == 'h2o'``, otherwise a pandas DataFrame —
        presumably; verify against callers).
    prediction_col : predicted classes, used by the classification
        kernel/tree helpers.
    is_classification : selects the classification variant where a
        helper distinguishes the two.
    model_name : library identifier; defaults to ``"xgboost"``.

    Returns
    -------
    tuple of (DataFrame with impact columns, explainer)
        ``explainer`` is ``None`` for catboost; for h2o the second
        element is the raw contributions frame from ``h2o_shap_results``.
    """
    # Libraries that share the xgboost-style tree explainer path.
    tree_models = {"xgboost", "lightgbm", "gradientboostingregressor"}

    if model_name in tree_models:
        return self.xgboost_shap(model, df)

    if model_name == "catboost":
        # catboost_shap returns only the dataframe; no explainer exists.
        return self.catboost_shap(model, df), None

    if model_name == 'h2o':
        return self.h2o_shap_results(model, df, df.as_data_frame(), prediction_col)

    if model_name == "randomforest":
        if is_classification:
            return self.randomforest_shap_classification(model, df, prediction_col)
        return self.randomforest_shap(model, df)

    # Any other gradient-boosting variant also takes the tree path.
    if "gradientboosting" in model_name:
        return self.xgboost_shap(model, df)

    # svm, knn, logisticregression, decisiontree, neuralnetwork and any
    # unrecognised name all fall back to the (slow) kernel explainer.
    if is_classification:
        return self.kernel_shap_classification(model, df, prediction_col)
    return self.kernel_shap(model, df)
265-
266-
267-
268-

0 commit comments

Comments
 (0)