@@ -26,7 +26,8 @@ scikit-learn :class:`~sklearn.pipeline.Pipeline`. These DAGs may be created from
   ...         ("impute", SimpleImputer()),
   ...         ("pca", PCA()),
   ...         ("lr", LogisticRegression())
-  ...     ]
+  ...     ],
+  ...     infer_dataframe=True,
   ... )

You may view a diagram of the DAG with the :meth:`~skdag.dag.DAG.show` method. In a
@@ -44,18 +45,25 @@ ASCII text:

.. image:: _static/img/dag1.png

+ Note that we also provided an extra option, ``infer_dataframe``. This is entirely
+ optional, but if set, the DAG will ensure that dataframe inputs have their column and
+ index information preserved (or inferred), and that the output of the pipeline will
+ also be a dataframe. This is useful if you wish to filter down the inputs for one
+ particular step to only include certain columns; something we shall see in action later.
+
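+ As a rough sketch of what this means in practice (this example is not part of the
+ original docs: it assumes the diabetes data loaded as a dataframe, a hypothetical
+ binarised target for the logistic regression, and the usual fit/predict interface):
+
+ .. code-block:: python
+
+    >>> from sklearn.datasets import load_diabetes
+    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)
+    >>> y = (y > y.median()).astype(int)  # hypothetical binary target
+    >>> dag = dag.fit(X, y)
+    >>> preds = dag.predict(X.head())
+    >>> # per the note above, preds comes back as a dataframe (index preserved)
+    >>> # rather than a bare numpy array
+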
For more complex DAGs, it is recommended to use a :class:`skdag.dag.DAGBuilder`,
which allows you to define the graph by specifying the dependencies of each new
estimator:

.. code-block:: python

   >>> from skdag import DAGBuilder
+  >>> from sklearn.compose import make_column_selector
   >>> dag = (
-  ...     DAGBuilder()
+  ...     DAGBuilder(infer_dataframe=True)
   ...     .add_step("impute", SimpleImputer())
-  ...     .add_step("vitals", "passthrough", deps={"impute": slice(0, 4)})
-  ...     .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": slice(4, 10)})
+  ...     .add_step("vitals", "passthrough", deps={"impute": ["age", "sex", "bmi", "bp"]})
+  ...     .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": make_column_selector("s[0-9]+")})
   ...     .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
   ...     .make_dag()
   ... )
@@ -73,7 +81,16 @@ the remaining columns have dimensionality reduction applied first before being
passed to the same regressor. Note that we can define our graph edges in two
different ways: as a dict (if we need to select only certain columns from the source
node) or as a simple list (if we want to simply grab all columns from all input
- nodes).
+ nodes). Columns may be specified as any kind of iterable (list, slice, etc.) or as a
+ column selector function that conforms to :func:`sklearn.compose.make_column_selector`.
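+
+ For instance, a selector built from a regex can be previewed on its own (a small
+ illustration that is not part of the original example; the dataframe here is made up):
+
+ .. code-block:: python
+
+    >>> import pandas as pd
+    >>> from sklearn.compose import make_column_selector
+    >>> df = pd.DataFrame({"age": [33.0], "s1": [0.1], "s2": [0.2]})
+    >>> make_column_selector("s[0-9]+")(df)  # returns the matching column names
+    ['s1', 's2']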
+
+ If you wish to specify string column names for dependencies, ensure you provide the
+ ``infer_dataframe=True`` option when you create a DAG. This will ensure that all
+ estimator outputs are coerced into dataframes. Where possible, column names will be
+ inferred; otherwise the columns will simply be named after the estimator step, with an
+ index number appended. If you do not specify ``infer_dataframe=True``, the DAG will
+ leave the outputs unmodified, which in most cases will mean numpy arrays that only
+ support numeric column indices.
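+
+ For comparison, the same graph could be built without ``infer_dataframe`` by selecting
+ columns positionally rather than by name (a sketch based on the earlier form of this
+ example; the positions are assumed to match the feature ordering used above):
+
+ .. code-block:: python
+
+    >>> dag = (
+    ...     DAGBuilder()
+    ...     .add_step("impute", SimpleImputer())
+    ...     .add_step("vitals", "passthrough", deps={"impute": slice(0, 4)})
+    ...     .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": slice(4, 10)})
+    ...     .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
+    ...     .make_dag()
+    ... )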

The DAG may now be used as an estimator in its own right:

@@ -189,7 +206,7 @@ as a dictionary of step name to column indices instead:
   >>> from sklearn.ensemble import RandomForestClassifier
   >>> from sklearn.svm import SVC
   >>> clf_stack = (
-  ...     DAGBuilder()
+  ...     DAGBuilder(infer_dataframe=True)
   ...     .add_step("pass", "passthrough")
   ...     .add_step("rf", RandomForestClassifier(), deps=["pass"])
   ...     .add_step("svr", SVC(), deps=["pass"])