
Commit f6a478d

jarverha authored and committed
added support and tests for pipeline explainer
1 parent 894c644 commit f6a478d

File tree

4 files changed: +327 −0 lines changed

powershap/shap_wrappers/shap_explainer.py

Lines changed: 57 additions & 0 deletions

@@ -12,6 +12,7 @@
 from numpy.random import RandomState
 from sklearn.model_selection import train_test_split
 from sklearn.utils.validation import validate_data
+# NOTE: ShapExplainerFactory is imported lazily in PipelineExplainer.__init__ (avoids a circular import).

 from sklearn.utils._tags import (
     ClassifierTags,
@@ -367,6 +368,62 @@ def _fit_get_shap(self, X_train, Y_train, X_val, Y_val, random_seed, **kwargs) -> np.array:
         C_explainer = shap.explainers.Linear(PowerShap_model, X_train)
         return C_explainer.shap_values(X_val)

+# Supports an sklearn Pipeline by wrapping the ShapExplainer of its final step.
+class PipelineExplainer(ShapExplainer):
+    def __init__(self, model: Any):
+        """Create a Powershap explainer instance.
+
+        Parameters
+        ----------
+        model: Any
+            The model whose shap values powershap will use to perform feature
+            selection.
+
+        """
+        assert self.supports_model(model)
+        self.model = model
+        # Lazy import to avoid a circular dependency with the factory module.
+        from .shap_explainer_factory import ShapExplainerFactory
+
+        # Delegate to the explainer that matches the pipeline's final estimator.
+        self.shap_explainer = ShapExplainerFactory.get_explainer(model=model.steps[-1][1])
+
+    @staticmethod
+    def supports_model(model) -> bool:
+        from sklearn.pipeline import Pipeline
+
+        return isinstance(model, Pipeline)
+
+    def _fit_get_shap(self, X_train, Y_train, X_val, Y_val, random_seed, **kwargs) -> np.array:
+        from sklearn.base import clone
+        from sklearn.pipeline import Pipeline
+
+        # 1. Because the ShapExplainer behavior differs per model, extract the
+        #    model and keep only the preprocessing steps of the pipeline.
+        powershap_pipeline = clone(Pipeline(self.model.steps[:-1]))
+
+        # 2. Build the parameter dictionary that sets every step's random state
+        #    to the random seed (format: 'step_name__parameter_name').
+        params_to_set = {}
+        for step_name, step_estimator in powershap_pipeline.steps:
+            if "random_state" in step_estimator.get_params():
+                params_to_set[f"{step_name}__random_state"] = random_seed
+
+        # 3. Apply the parameters to the cloned pipeline.
+        powershap_pipeline.set_params(**params_to_set)
+
+        # Fit the preprocessing pipeline here so it can transform the data.
+        powershap_pipeline.fit(X_train, Y_train)
+
+        # Get the transformed data from all the preceding steps.
+        transformed_X_train = powershap_pipeline.transform(X_train)
+        transformed_X_val = powershap_pipeline.transform(X_val)
+
+        return self.shap_explainer._fit_get_shap(
+            transformed_X_train, Y_train, transformed_X_val, Y_val, random_seed, **kwargs
+        )
+
+    def validate_data(self, _estimator, X, y, **kwargs):
+        # Assumption: the wrapped model is the limiting factor for validating the data.
+        return self.shap_explainer.validate_data(_estimator, X, y, **kwargs)
+
+    def _get_more_tags(self):
+        return self.shap_explainer._get_more_tags()


 ### DEEP LEARNING
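For context, a minimal usage sketch of what this commit enables: an sklearn Pipeline can now be passed directly as the PowerShap model, and the wrapper routes it to PipelineExplainer. The dataset, step choices, and names below are illustrative assumptions, not part of the commit.

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from powershap import PowerShap

# Toy data, purely for illustration.
X, y = make_classification(n_samples=500, n_features=10, n_informative=3, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

# A pipeline whose final step is the model; the earlier steps are preprocessing.
selector = PowerShap(
    model=make_pipeline(StandardScaler(), CatBoostClassifier(n_estimators=100, verbose=0)),
    power_iterations=15,
    automatic=False,
)
selector.fit(X, y)
X_selected = selector.transform(X)  # keeps only the features powershap found informative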

powershap/shap_wrappers/shap_explainer_factory.py

Lines changed: 2 additions & 0 deletions

@@ -10,6 +10,7 @@
     LinearExplainer,
     ShapExplainer,
     XGBoostExplainer,
+    PipelineExplainer,
 )


@@ -23,6 +24,7 @@ class ShapExplainerFactory:
         EnsembleExplainer,
         LinearExplainer,
         DeepLearningExplainer,
+        PipelineExplainer,
     ]

     @classmethod
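The factory's dispatch logic itself is not part of this diff; the following is a hedged sketch of what registering PipelineExplainer in the list above presumably amounts to. Everything beyond the class list and the supports_model() static method visible in this commit is an assumption, not the library's actual code.

# Hedged sketch of the dispatch implied by the _explainer_models list; the
# real implementation in shap_explainer_factory.py may differ in details.
class ShapExplainerFactorySketch:
    _explainer_models = [
        # ... CatboostExplainer, EnsembleExplainer, LinearExplainer, ...,
        # PipelineExplainer (added by this commit)
    ]

    @classmethod
    def get_explainer(cls, model):
        # The first explainer whose supports_model() accepts the model wins,
        # so an sklearn Pipeline is now routed to PipelineExplainer.
        for explainer_class in cls._explainer_models:
            if explainer_class.supports_model(model):
                return explainer_class(model)
        raise ValueError(f"No explainer supports a model of type {type(model)}")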

tests/test_pipeline_powershap.py

Lines changed: 204 additions & 0 deletions

@@ -0,0 +1,204 @@
__author__ = "Jeroen Van Der Donckt, Jarne Verhaeghe"

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

from powershap import PowerShap

from .conftest import dummy_classification, dummy_regression


def test_pipeline_catboost_class_powershap(dummy_classification):
    X, y = dummy_classification
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    DummyScaler = FunctionTransformer(lambda x: x)

    selector = PowerShap(
        model=make_pipeline(DummyScaler, CatBoostClassifier(n_estimators=250, verbose=0)),
        power_iterations=15,
        automatic=False,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) == n_informative
    assert all([c.startswith("informative") for c in selected_feats.columns])


def test_pipeline_catboost_regr_powershap(dummy_regression):
    X, y = dummy_regression
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    DummyScaler = FunctionTransformer(lambda x: x)

    selector = PowerShap(
        model=make_pipeline(DummyScaler, CatBoostRegressor(n_estimators=250, verbose=0)),
        power_iterations=15,
        automatic=False,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) == n_informative
    assert all([c.startswith("informative") for c in selected_feats.columns])


def test_pipeline_catboost_handle_nans(dummy_classification):
    X, y = dummy_classification
    X.iloc[:5] = None
    X["nan_col"] = None
    assert np.any(pd.isna(X))
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    DummyScaler = FunctionTransformer(lambda x: x)

    selector = PowerShap(
        model=make_pipeline(DummyScaler, CatBoostClassifier(n_estimators=10, verbose=0)),
        power_iterations=15,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) == n_informative
    assert all([c.startswith("informative") for c in selected_feats.columns])


def test_pipeline_catboost_handle_infs(dummy_classification):
    X, y = dummy_classification
    X.iloc[:5] = np.inf
    X["inf_col"] = np.inf
    assert np.any(X.isin([np.inf, -np.inf]))
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    DummyScaler = FunctionTransformer(lambda x: x)

    selector = PowerShap(
        model=make_pipeline(DummyScaler, CatBoostClassifier(n_estimators=10, verbose=0)),
        power_iterations=15,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) == n_informative
    assert all([c.startswith("informative") for c in selected_feats.columns])


def test_pipeline_catboost_handle_infs_nans(dummy_classification):
    X, y = dummy_classification
    X.iloc[:5] = np.inf
    X.iloc[5:10] = None
    X["inf_col"] = np.inf
    X["nan_col"] = None
    assert np.any(X.isin([np.inf, -np.inf]))
    assert np.any(pd.isna(X))
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    DummyScaler = FunctionTransformer(lambda x: x)

    selector = PowerShap(
        model=make_pipeline(DummyScaler, CatBoostClassifier(n_estimators=10, verbose=0)),
        power_iterations=15,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) == n_informative
    assert all([c.startswith("informative") for c in selected_feats.columns])


def test_pipeline_catboost_handle_strings(dummy_classification):
    X, y = dummy_classification
    X["cat"] = "miauw"
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    DummyScaler = FunctionTransformer(lambda x: x)

    selector = PowerShap(
        model=make_pipeline(
            DummyScaler,
            CatBoostClassifier(n_estimators=30, verbose=0, cat_features=[X.shape[1] - 1]),
        ),
        power_iterations=15,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) == n_informative
    assert all([c.startswith("informative") for c in selected_feats.columns])


def test_pipeline_ensemble_class_powershap(dummy_classification):
    X, y = dummy_classification
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    DummyScaler = FunctionTransformer(lambda x: x)

    selector = PowerShap(
        model=make_pipeline(DummyScaler, RandomForestClassifier(n_estimators=25)),
        power_iterations=15,
        automatic=False,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) >= n_informative
    assert sum([c.startswith("informative") for c in selected_feats.columns]) == n_informative


def test_pipeline_ensemble_regr_powershap(dummy_regression):
    X, y = dummy_regression
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    DummyScaler = FunctionTransformer(lambda x: x)

    selector = PowerShap(
        model=make_pipeline(DummyScaler, RandomForestRegressor(n_estimators=25)),
        power_iterations=15,
        automatic=False,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) >= n_informative
    assert sum([c.startswith("informative") for c in selected_feats.columns]) == n_informative


def test_pipeline_catboost_class_standardscaler_powershap(dummy_classification):
    from sklearn.preprocessing import StandardScaler

    X, y = dummy_classification
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    selector = PowerShap(
        # Scalers must be instantiated before being placed in a pipeline.
        model=make_pipeline(StandardScaler(), CatBoostClassifier(n_estimators=250, verbose=0)),
        power_iterations=15,
        automatic=False,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) == n_informative
    assert all([c.startswith("informative") for c in selected_feats.columns])


def test_pipeline_catboost_class_maxabsscaler_robustscaler_powershap(dummy_classification):
    from sklearn.preprocessing import MaxAbsScaler, RobustScaler

    X, y = dummy_classification
    n_informative = sum([c.startswith("informative") for c in X.columns])
    assert n_informative > 0, "No informative columns in the dummy data!"

    selector = PowerShap(
        model=make_pipeline(
            MaxAbsScaler(), RobustScaler(), CatBoostClassifier(n_estimators=250, verbose=0)
        ),
        power_iterations=15,
        automatic=False,
    )

    selector.fit(X, y)
    selected_feats = selector.transform(X)

    assert len(selected_feats.columns) == n_informative
    assert all([c.startswith("informative") for c in selected_feats.columns])
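A side note on the DummyScaler used throughout these tests: FunctionTransformer wraps an arbitrary callable as an sklearn transformer, so with the identity function it becomes a no-op preprocessing step that exercises the pipeline plumbing without changing the data. A minimal standalone check (illustrative, not part of the commit):

import numpy as np
from sklearn.preprocessing import FunctionTransformer

# With the identity function, FunctionTransformer passes data through unchanged.
identity = FunctionTransformer(lambda x: x)
data = np.array([[1.0, 2.0], [3.0, 4.0]])
assert np.array_equal(identity.fit_transform(data), data)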

tests/test_shap_explainer.py

Lines changed: 64 additions & 0 deletions

@@ -10,6 +10,7 @@
     LGBMExplainer,
     LinearExplainer,
     XGBoostExplainer,
+    PipelineExplainer,
 )


@@ -110,6 +111,69 @@ def test_get_ensemble_explainer():
         explainer = ShapExplainerFactory.get_explainer(model_class())
         assert isinstance(explainer, EnsembleExplainer)

+def test_get_pipeline_explainer():
+    from sklearn.linear_model import (
+        LinearRegression,
+        LogisticRegression,
+        LogisticRegressionCV,
+        PassiveAggressiveClassifier,
+        Perceptron,
+        Ridge,
+        RidgeClassifier,
+        RidgeClassifierCV,
+        RidgeCV,
+        SGDClassifier,
+        SGDRegressor,
+    )
+    from sklearn.ensemble import (
+        ExtraTreesClassifier,
+        ExtraTreesRegressor,
+        GradientBoostingClassifier,
+        GradientBoostingRegressor,
+        RandomForestClassifier,
+        RandomForestRegressor,
+    )
+    from catboost import CatBoostClassifier, CatBoostRegressor
+    from lightgbm import LGBMClassifier, LGBMRegressor
+    from xgboost import XGBClassifier, XGBRegressor
+
+    model_classes = [
+        LogisticRegression,
+        LogisticRegressionCV,
+        PassiveAggressiveClassifier,
+        Perceptron,
+        RidgeClassifier,
+        RidgeClassifierCV,
+        SGDClassifier,
+        LinearRegression,
+        Ridge,
+        RidgeCV,
+        SGDRegressor,
+        RandomForestClassifier,
+        GradientBoostingClassifier,
+        ExtraTreesClassifier,
+        RandomForestRegressor,
+        GradientBoostingRegressor,
+        ExtraTreesRegressor,
+        XGBClassifier, XGBRegressor,
+        LGBMClassifier, LGBMRegressor,
+        CatBoostClassifier, CatBoostRegressor,
+    ]
+    from sklearn.pipeline import make_pipeline
+    from sklearn.preprocessing import FunctionTransformer
+
+    for model_class in model_classes:
+        DummyScaler = FunctionTransformer(lambda x: x)
+        # Wrap each model in a pipeline and check that the factory dispatches
+        # it to the PipelineExplainer.
+        pipeline = make_pipeline(DummyScaler, model_class())
+        explainer = ShapExplainerFactory.get_explainer(pipeline)
+        assert isinstance(explainer, PipelineExplainer)


 # def test_get_deep_learning_explainer():
 #     import tensorflow as tf
