|
1 | 1 | import os |
2 | 2 | import sys |
| 3 | + |
| 4 | +import re |
| 5 | + |
| 6 | + |
3 | 7 | from pathlib import Path |
4 | 8 | from sys import platform |
5 | 9 | import subprocess |
| 10 | +import time |
6 | 11 |
|
# Anchor all data lookups to the directory containing this file, regardless
# of the process's current working directory.
path = Path(__file__).parent.absolute()
# Bundled example datasets live in ./datasets next to this module.
path_dataset = str(path / "datasets")
@@ -131,6 +136,97 @@ def ai(self, df, y, model, model_name="xgboost", mode=None): |
131 | 136 |
|
132 | 137 | return True |
133 | 138 |
|
| 139 | + |
| 140 | + def ai_test(self, df, y, model, model_name="xgboost", mode=None): |
| 141 | + y_variable= "y_actual" |
| 142 | + y_variable_predict= "y_prediction" |
| 143 | + |
| 144 | + |
| 145 | + |
| 146 | + prediction_col=[] |
| 147 | + |
| 148 | + if model_name == "xgboost": |
| 149 | + import xgboost |
| 150 | + if xgboost.__version__ in ['1.1.0', '1.1.1', '1.1.0rc2', '1.1.0rc1']: |
| 151 | + print("Current Xgboost version is not supported. Please install Xgboost using 'pip install xgboost==1.0.2'") |
| 152 | + return False |
| 153 | + prediction_col = model.predict(xgboost.DMatrix(df)) |
| 154 | + |
| 155 | + elif model_name == "catboost": |
| 156 | + prediction_col = model.predict(df.to_numpy()) |
| 157 | + |
| 158 | + else: |
| 159 | + prediction_col = model.predict(df.to_numpy()) |
| 160 | + |
| 161 | + # is classification? |
| 162 | + is_classification = self.is_classification_given_y_array(prediction_col) |
| 163 | + |
| 164 | + |
| 165 | + |
| 166 | + #shap |
| 167 | + c = calculate_shap() |
| 168 | + self.df_final, self.explainer = c.find(model, df, prediction_col, is_classification, model_name=model_name) |
| 169 | + |
| 170 | + #prediction col |
| 171 | + self.df_final[y_variable_predict] = prediction_col |
| 172 | + |
| 173 | + |
| 174 | + |
| 175 | + self.df_final[y_variable] = y |
| 176 | + |
| 177 | + |
| 178 | + #additional inputs. |
| 179 | + if is_classification==True: |
| 180 | + # find and add probabilities in the dataset. |
| 181 | + prediction_col_prob = model.predict_proba(df.to_numpy()) |
| 182 | + pd_prediction_col_prob = pd.DataFrame(prediction_col_prob) |
| 183 | + |
| 184 | + for c in pd_prediction_col_prob.columns: |
| 185 | + self.df_final["probability_of_predicting_class_" + str(c)] = list(pd_prediction_col_prob[c]) |
| 186 | + |
| 187 | + classes = [] |
| 188 | + for c in pd_prediction_col_prob.columns: |
| 189 | + classes.append(str(c)) |
| 190 | + self.param["classes"]=classes |
| 191 | + |
| 192 | + try: |
| 193 | + expected_values_by_class = self.explainer.expected_value |
| 194 | + except: |
| 195 | + expected_values_by_class=[] |
| 196 | + for c in range(len(classes)): |
| 197 | + expected_values_by_class.append(1/len(classes)) |
| 198 | + |
| 199 | + |
| 200 | + self.param["expected_values"]= expected_values_by_class |
| 201 | + else: |
| 202 | + try: |
| 203 | + expected_values = self.explainer.expected_value |
| 204 | + self.param["expected_values"] = [expected_values] |
| 205 | + except: |
| 206 | + expected_value = [round(np.array(y).mean(),2)] |
| 207 | + self.param["expected_values"] = expected_value |
| 208 | + |
| 209 | + |
| 210 | + self.param["is_classification"]= is_classification |
| 211 | + self.param["model_name"]= model_name |
| 212 | + self.param["model"]= model |
| 213 | + self.param["columns"]= df.columns |
| 214 | + self.param["y_variable"]= y_variable |
| 215 | + self.param["y_variable_predict"]= y_variable_predict |
| 216 | + |
| 217 | + |
| 218 | + |
| 219 | + # manually test all the graphs to see if all work |
| 220 | + |
| 221 | + g = plotly_graphs() |
| 222 | + |
| 223 | + __, df2 = g.feature_importance(self.df_final) |
| 224 | + fim, df2 = g.feature_impact(self.df_final) |
| 225 | + sp = g.summary_plot(self.df_final) |
| 226 | + |
| 227 | + |
| 228 | + return True |
| 229 | + |
134 | 230 | def dataset_boston(self): |
135 | 231 | # load JS visualization code to notebook |
136 | 232 | shap.initjs() |
@@ -195,3 +291,6 @@ def run_command(command): |
195 | 291 |
|
196 | 292 |
|
197 | 293 |
|
| 294 | + |
| 295 | + |
| 296 | + |
0 commit comments