Merge pull request #29 from satra/enh-pipeline

satra · web-flow · commit a44e5356da47 · 2020-06-23T10:06:13.000-04:00
Add support for basic scikit-learn pipelines
diff --git a/README.md b/README.md
@@ -11,11 +11,10 @@ scale across a set of classifiers and metrics. It will also use Pydra's caching
 to not redo model training and evaluation when new metrics are added, or when
 number of iterations (`n_splits`) is increased.
 
-Upcoming features:
-1. Improve output report containing [SHAP](https://github.com/slundberg/shap)
+1. Output report contains [SHAP](https://github.com/slundberg/shap)
   feature analysis.
-2. Allow for comparing scikit-learn pipelines.
-3. Test on scikit-learn compatible classifiers
+2. Supports comparing *some* scikit-learn pipelines in addition to base
+   classifiers.
 
 ### Installation
 
@@ -109,6 +108,16 @@ This is a list of classifiers from scikit learn and uses an array to encode:
 when param grid is provided and default classifier parameters are not changed,
 then an empty dictionary **MUST** be provided as parameter 3.
 
+An entry can also be a list of such arrays, which is run as a scikit-learn
+Pipeline with the steps applied in the listed order. For example:
+
+```
+ [ ["sklearn.impute", "SimpleImputer"],
+   ["sklearn.preprocessing", "StandardScaler"],
+   ["sklearn.tree", "DecisionTreeClassifier", {"max_depth": 5}]
+  ]
+```
+
 ## Example specification:
 
 ```
@@ -121,17 +130,17 @@ then an empty dictionary **MUST** be provided as parameter 3.
  "test_size": 0.2,
  "clf_info": [
  ["sklearn.ensemble", "AdaBoostClassifier"],
- ["sklearn.naive_bayes", "GaussianNB"],
  ["sklearn.tree", "DecisionTreeClassifier", {"max_depth": 5}],
- ["sklearn.ensemble", "RandomForestClassifier", {"n_estimators": 100}],
- ["sklearn.ensemble", "ExtraTreesClassifier", {"n_estimators": 100, "class_weight": "balanced"}],
- ["sklearn.linear_model", "LogisticRegressionCV", {"solver": "liblinear", "penalty": "l1"}],
  ["sklearn.neural_network", "MLPClassifier", {"alpha": 1, "max_iter": 1000}],
  ["sklearn.svm", "SVC", {"probability": true},
   [{"kernel": ["rbf", "linear"], "C": [1, 10, 100, 1000]}]],
  ["sklearn.neighbors", "KNeighborsClassifier", {},
   [{"n_neighbors": [3, 5, 7, 9, 11, 13, 15, 17, 19],
-    "weights": ["uniform", "distance"]}]]
+    "weights": ["uniform", "distance"]}]],
+ [ ["sklearn.impute", "SimpleImputer"],
+   ["sklearn.preprocessing", "StandardScaler"],
+   ["sklearn.tree", "DecisionTreeClassifier", {"max_depth": 5}]
+  ]
  ],
  "permute": [true, false],
  "gen_shap": true,
diff --git a/long-spec.json.sample b/long-spec.json.sample
@@ -8,7 +8,9 @@
  "clf_info": [
  ["sklearn.ensemble", "AdaBoostClassifier"],
  ["sklearn.naive_bayes", "GaussianNB"],
- ["sklearn.tree", "DecisionTreeClassifier", {"max_depth": 5}],
+ [ ["sklearn.impute", "SimpleImputer"],
+   ["sklearn.preprocessing", "StandardScaler"],
+   ["sklearn.tree", "DecisionTreeClassifier", {"max_depth": 5}]],
  ["sklearn.ensemble", "RandomForestClassifier", {"n_estimators": 100}],
  ["sklearn.ensemble", "ExtraTreesClassifier", {"n_estimators": 100, "class_weight": "balanced"}],
  ["sklearn.linear_model", "LogisticRegressionCV", {"solver": "liblinear", "penalty": "l1"}],
diff --git a/pydra_ml/report.py b/pydra_ml/report.py
@@ -87,7 +87,10 @@ def gen_report_shap_class(results, output_dir="./", plot_top_n_shap=16):
     indexes_all = {}
 
     for model_results in results:
-        model_name = model_results[0].get("ml_wf.clf_info")[1]
+        model_name = model_results[0].get("ml_wf.clf_info")
+        if isinstance(model_name[0], list):
+            model_name = model_name[-1]
+        model_name = model_name[1]
         indexes_all[model_name] = []
         shaps = model_results[
             1
@@ -179,7 +182,10 @@ def gen_report_shap_regres(results, output_dir="./", plot_top_n_shap=16):
     indexes_all = {}
 
     for model_results in results:
-        model_name = model_results[0].get("ml_wf.clf_info")[1]
+        model_name = model_results[0].get("ml_wf.clf_info")
+        if isinstance(model_name[0], list):
+            model_name = model_name[-1]
+        model_name = model_name[1]
         indexes_all[model_name] = []
         shaps = model_results[
             1
@@ -308,7 +314,17 @@ def gen_report(
         score = val[1].output.score
         if not isinstance(score, list):
             score = [score]
-        name = val[0][prefix + ".clf_info"][1].split("Classifier")[0]
+
+        clf = val[0][prefix + ".clf_info"]
+        if isinstance(clf[0], list):
+            clf = clf[-1][1]
+        else:
+            clf = clf[1]
+        if "Classifier" in clf:
+            name = clf.split("Classifier")[0]
+        else:
+            name = clf.split("Regressor")[0]
+        name = name.split("CV")[0]
         permute = val[0][prefix + ".permute"]
         for scoreval in score:
             for idx, metric in enumerate(metrics):
diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py
@@ -62,21 +62,35 @@ def train_test_kernel(X, y, train_test_split, split_index, clf_info, permute):
     :param permute: whether to run it in permuted mode or not
     :return: outputs, trained classifier with sample indices
     """
-    from sklearn.preprocessing import StandardScaler
     from sklearn.pipeline import Pipeline
     import numpy as np
 
-    mod = __import__(clf_info[0], fromlist=[clf_info[1]])
-    params = {}
-    if len(clf_info) > 2:
-        params = clf_info[2]
-    clf = getattr(mod, clf_info[1])(**params)
-    if len(clf_info) == 4:
-        from sklearn.model_selection import GridSearchCV
+    def to_instance(clf_info):
+        mod = __import__(clf_info[0], fromlist=[clf_info[1]])
+        params = {}
+        if len(clf_info) > 2:
+            params = clf_info[2]
+        clf = getattr(mod, clf_info[1])(**params)
+        if len(clf_info) == 4:
+            from sklearn.model_selection import GridSearchCV
+
+            clf = GridSearchCV(clf, param_grid=clf_info[3])
+        return clf
+
+    if isinstance(clf_info[0], list):
+        # Process as a pipeline constructor
+        steps = []
+        for val in clf_info:
+            step = to_instance(val)
+            steps.append((val[1], step))
+        pipe = Pipeline(steps)
+    else:
+        clf = to_instance(clf_info)
+        from sklearn.preprocessing import StandardScaler
+
+        pipe = Pipeline([("std", StandardScaler()), (clf_info[1], clf)])
 
-        clf = GridSearchCV(clf, param_grid=clf_info[3])
     train_index, test_index = train_test_split[split_index]
-    pipe = Pipeline([("std", StandardScaler()), (clf_info[1], clf)])
     y = y.ravel()
     if permute:
         pipe.fit(X[train_index], y[np.random.permutation(train_index)])
diff --git a/pydra_ml/tests/test_classifier.py b/pydra_ml/tests/test_classifier.py
@@ -5,7 +5,11 @@
 def test_classifier(tmpdir):
     clfs = [
         ("sklearn.neural_network", "MLPClassifier", {"alpha": 1, "max_iter": 1000}),
-        ("sklearn.naive_bayes", "GaussianNB", {}),
+        [
+            ["sklearn.impute", "SimpleImputer"],
+            ["sklearn.preprocessing", "StandardScaler"],
+            ["sklearn.naive_bayes", "GaussianNB", {}],
+        ],
     ]
     csv_file = os.path.join(os.path.dirname(__file__), "data", "breast_cancer.csv")
     inputs = {
@@ -32,7 +36,11 @@ def test_classifier(tmpdir):
 
 def test_regressor(tmpdir):
     clfs = [
-        ("sklearn.neural_network", "MLPRegressor", {"alpha": 1, "max_iter": 1000}),
+        [
+            ["sklearn.impute", "SimpleImputer"],
+            ["sklearn.preprocessing", "StandardScaler"],
+            ["sklearn.neural_network", "MLPRegressor", {"alpha": 1, "max_iter": 1000}],
+        ],
         (
             "sklearn.linear_model",
             "LinearRegression",
@@ -58,6 +66,6 @@ def test_regressor(tmpdir):
 
     wf = gen_workflow(inputs, cache_dir=tmpdir)
     results = run_workflow(wf, "cf", {"n_procs": 1})
-    assert results[0][0]["ml_wf.clf_info"][1] == "MLPRegressor"
+    assert results[0][0]["ml_wf.clf_info"][-1][1] == "MLPRegressor"
     assert results[0][0]["ml_wf.permute"]
     assert results[0][1].output.score[0][0] < results[1][1].output.score[0][0]
diff --git a/short-spec.json.sample b/short-spec.json.sample
@@ -7,7 +7,10 @@
  "test_size": 0.2,
  "clf_info": [
   ["sklearn.neural_network", "MLPClassifier", {"alpha": 1, "max_iter": 1000}],
-  ["sklearn.tree", "DecisionTreeClassifier", {"max_depth": 5}]
+  [ ["sklearn.impute", "SimpleImputer"],
+    ["sklearn.preprocessing", "StandardScaler"],
+    ["sklearn.tree", "DecisionTreeClassifier", {"max_depth": 5}]
+  ]
  ],
  "permute": [false, true],
  "gen_shap": true,