Merge pull request #59 from nipype/enh/pydra23

satra · web-flow · commit 64e351eb1941 · 2024-02-24T16:16:50.000-05:00
initial attempt to work with pydra 0.23+
diff --git a/.gitignore b/.gitignore
@@ -133,3 +133,9 @@ dmypy.json
 
 # pycharm
 .idea/
+
+# Venvs
+*.venv
+
+# Generated messages
+/messages
diff --git a/pydra_ml/__init__.py b/pydra_ml/__init__.py
@@ -35,3 +35,7 @@ def set_logger_level(lgr, level):
 set_logger_level(lgr, os.environ.get("PYDRAML_LOG_LEVEL", logging.INFO))
 FORMAT = "%(asctime)-15s [%(levelname)8s] %(message)s"
 logging.basicConfig(format=FORMAT)
+
+from . import _version
+
+__version__ = _version.get_versions()["version"]
diff --git a/pydra_ml/classifier.py b/pydra_ml/classifier.py
@@ -71,7 +71,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
         messengers=FileMessenger(),
         messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")},
     )
-    wf.split(["clf_info", "permute"])
+    wf.split(clf_info=inputs["clf_info"], permute=inputs["permute"])
     wf.add(
         read_file_pdt(
             name="readcsv",
@@ -102,7 +102,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
             permute=wf.lzin.permute,
         )
     )
-    wf.fit_clf.split("split_index")
+    wf.fit_clf.split(split_index=wf.gensplit.lzout.split_indices)
     wf.add(
         calc_metric_pdt(
             name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics
diff --git a/pydra_ml/report.py b/pydra_ml/report.py
@@ -3,12 +3,15 @@
 import pickle
 import warnings
 
+import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 from sklearn.metrics import accuracy_score, explained_variance_score
 
+matplotlib.use("Agg")
+
 
 def save_obj(obj, path):
     with open(path, "wb") as f:
@@ -97,9 +100,9 @@ def plot_summary(summary, output_dir=None, filename="shap_plot", plot_top_n_shap
     # plot without all bootstrapping values
     summary = summary[["mean", "std", "min", "max"]]
     num_features = len(list(summary.index))
-    if (plot_top_n_shap != 1 and type(plot_top_n_shap) == float) or type(
+    if (plot_top_n_shap != 1 and type(plot_top_n_shap) is float) or type(
         plot_top_n_shap
-    ) == int:
+    ) is int:
         # if plot_top_n_shap != 1.0 but includes 1 (int)
         if plot_top_n_shap <= 0:
             raise ValueError(
@@ -223,7 +226,7 @@ def gen_report_shap_class(results, output_dir="./", plot_top_n_shap=16):
                         f"""There were no {quadrant.upper()}s, this will output NaNs
                         in the csv and figure for this split column"""
                     )
-                shaps_i_quadrant = shaps_i[
+                shaps_i_quadrant = np.array(shaps_i)[
                     indexes.get(quadrant)
                 ]  # shape (P, F) P prediction x F feature_names
                 abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance
@@ -325,7 +328,7 @@ def gen_report_shap_regres(results, output_dir="./", plot_top_n_shap=16):
                         f"""There were no {quadrant.upper()}s, this will
                         output NaNs in the csv and figure for this split column"""
                     )
-                shaps_i_quadrant = shaps_i[
+                shaps_i_quadrant = np.array(shaps_i)[
                     indexes.get(quadrant)
                 ]  # shape (P, F) P prediction x F feature_names
                 abs_weighted_shap_values = np.abs(shaps_i_quadrant) * split_performance
diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py
@@ -1,5 +1,15 @@
 #!/usr/bin/env python
 
+import typing as ty
+
+from pydra.utils.hash import Cache, register_serializer
+from sklearn.pipeline import Pipeline
+
+
+@register_serializer
+def bytes_repr_Pipeline(obj: Pipeline, cache: Cache):
+    yield str(obj).encode()
+
 
 def read_file(filename, x_indices=None, target_vars=None, group=None):
     """Read a CSV data file
@@ -92,7 +102,7 @@ def to_instance(clf_info):
 
     train_index, test_index = train_test_split[split_index]
     y = y.ravel()
-    if type(X[0][0]) == str:
+    if type(X[0][0]) is str:
         # it's loaded as bytes, so we need to decode as utf-8
         X = np.array([str.encode(n[0]).decode("utf-8") for n in X])
     if permute:
@@ -126,7 +136,27 @@ def calc_metric(output, metrics):
     return score, output
 
 
-def get_feature_importance(permute, model, gen_feature_importance=True):
+def get_feature_importance(
+    *,
+    permute: bool,
+    model: ty.Tuple[Pipeline, list, list],
+    gen_feature_importance: bool = True,
+):
+    """Compute feature importance for the model
+
+    Parameters
+    ----------
+    permute : bool
+        Whether or not to run the model in permuted mode
+    model : tuple(sklearn.pipeline.Pipeline, list, list)
+        The (pipeline, train_index, test_index) triple to compute importance for
+    gen_feature_importance : bool
+        Whether or not to generate the feature importance
+    Returns
+    -------
+    list
+        Feature importances; ``[]`` if permute or not gen_feature_importance
+    """
     if permute or not gen_feature_importance:
         return []
     pipeline, train_index, test_index = model
@@ -172,7 +202,7 @@ def get_feature_importance(permute, model, gen_feature_importance=True):
                 pipeline_steps.coefs_
                 pipeline_steps.coef_
 
-                Please add correct method in tasks.py or if inexistent,
+                Please add correct method in tasks.py or if non-existent,
                 set gen_feature_importance to false in the spec file.
 
                 This is the error that was returned by sklearn:\n\t{e}\n
@@ -224,7 +254,9 @@ def get_shap(X, permute, model, gen_shap=False, nsamples="auto", l1_reg="aic"):
     import shap
 
     explainer = shap.KernelExplainer(pipe.predict, shap.kmeans(X[train_index], 5))
-    shaps = explainer.shap_values(X[test_index], nsamples=nsamples, l1_reg=l1_reg)
+    shaps = explainer.shap_values(
+        X[test_index], nsamples=nsamples, l1_reg=l1_reg, silent=True
+    )
     return shaps
 
 
diff --git a/setup.cfg b/setup.cfg
@@ -26,7 +26,7 @@ classifiers =
 [options]
 python_requires = >= 3.8
 install_requires =
-    pydra == 0.22.0
+    pydra >= 0.23.0-alpha
     psutil
     scikit-learn
     seaborn
@@ -35,11 +35,9 @@ install_requires =
 
 test_requires =
     pytest >= 4.4.0
-    pytest-cov
     pytest-env
     pytest-xdist
     pytest-rerunfailures
-    codecov
 packages = find:
 include_package_data = True
 
@@ -58,11 +56,9 @@ docs =
     %(doc)s
 test =
     pytest >= 4.4.0
-    pytest-cov
     pytest-env
     pytest-xdist
     pytest-rerunfailures
-    codecov
 tests =
     %(test)s
 dev =

Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):`
`71`	`71`	`messengers=FileMessenger(),`
`72`	`72`	`messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")},`
`73`	`73`	`)`
`74`		`- wf.split(["clf_info", "permute"])`
	`74`	`+ wf.split(clf_info=inputs["clf_info"], permute=inputs["permute"])`
`75`	`75`	`wf.add(`
`76`	`76`	`read_file_pdt(`
`77`	`77`	`name="readcsv",`
`@@ -102,7 +102,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):`
`102`	`102`	`permute=wf.lzin.permute,`
`103`	`103`	`)`
`104`	`104`	`)`
`105`		`- wf.fit_clf.split("split_index")`
	`105`	`+ wf.fit_clf.split(split_index=wf.gensplit.lzout.split_indices)`
`106`	`106`	`wf.add(`
`107`	`107`	`calc_metric_pdt(`
`108`	`108`	`name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics`