
Commit a3a17d8

v0.0.5
1 parent 1a329c2 commit a3a17d8

File tree

13 files changed: +190 -89 lines


doc/_static/img/cover.png    -10.1 KB
doc/_static/img/dag2.png     -55.4 KB
doc/_static/img/dag2a.png    50.1 KB
doc/_static/img/dag3.png     -80.2 KB
doc/_static/img/dag3a.png    62.3 KB

doc/quick_start.rst

Lines changed: 33 additions & 31 deletions
@@ -26,23 +26,26 @@ The simplest DAGs are just a chain of singular dependencies. These DAGs may be
 created from the :meth:`skdag.dag.DAG.from_pipeline` method in the same way as a
 DAG:

->>> from sklearn.decomposition import PCA
->>> from sklearn.impute import SimpleImputer
->>> from sklearn.linear_model import LogisticRegression
->>> dag = DAG.from_pipeline(
-...     steps=[
-...         ("impute", SimpleImputer()),
-...         ("pca", PCA()),
-...         ("lr", LogisticRegression())
-...     ]
-... )
->>> dag.draw()
-o impute
-|
-o pca
-|
-o lr
-<BLANKLINE>
+.. code-block:: python
+
+>>> from skdag import DAGBuilder
+>>> from sklearn.decomposition import PCA
+>>> from sklearn.impute import SimpleImputer
+>>> from sklearn.linear_model import LogisticRegression
+>>> dag = DAGBuilder().from_pipeline(
+...     steps=[
+...         ("impute", SimpleImputer()),
+...         ("pca", PCA()),
+...         ("lr", LogisticRegression())
+...     ]
+... ).make_dag()
+>>> dag.show()
+o impute
+|
+o pca
+|
+o lr
+<BLANKLINE>

 .. image:: _static/img/dag1.png

@@ -52,7 +55,6 @@ estimator:

 .. code-block:: python

->>> from skdag import DAGBuilder
 >>> dag = (
 ...     DAGBuilder(infer_dataframe=True)
 ...     .add_step("impute", SimpleImputer())
@@ -61,15 +63,15 @@ estimator:
 ...     .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
 ...     .make_dag()
 ... )
->>> dag.draw()
+>>> dag.show()
 o impute
 |\
 o o blood,vitals
 |/
 o lr
 <BLANKLINE>

-.. image:: _static/img/dag2.png
+.. image:: _static/img/dag2a.png

 In the above examples we pass the first four columns directly to a regressor, but
 the remaining columns have dimensionality reduction applied first before being
@@ -82,36 +84,36 @@ on how to control this behaviour, see the `User Guide <user_guide.html>`_.
 The DAG may now be used as an estimator in its own right:

 >>> from sklearn import datasets
->>> X, y = datasets.load_diabetes(return_X_y=True)
->>> dag.fit_predict(X, y)
-array([...
+>>> X, y = datasets.load_diabetes(return_X_y=True, as_frame=True)
+>>> type(dag.fit_predict(X, y))
+<class 'pandas.core.series.Series'>

 In an extension to the scikit-learn estimator interface, DAGs also support multiple
 inputs and multiple outputs. Let's say we want to compare two different classifiers:

 >>> from sklearn.ensemble import RandomForestClassifier
->>> cal = DAG.from_pipeline(
+>>> cal = DAGBuilder(infer_dataframe=True).from_pipeline(
 ...     [("rf", RandomForestClassifier(random_state=0))]
-... )
+... ).make_dag()
 >>> dag2 = dag.join(cal, edges=[("blood", "rf"), ("vitals", "rf")])
->>> dag2.draw()
+>>> dag2.show()
 o impute
 |\
 o o blood,vitals
 |x|
 o o lr,rf
 <BLANKLINE>

-.. image:: _static/img/dag3.png
+.. image:: _static/img/dag3a.png

 Now our DAG will return two outputs: one from each classifier. Multiple outputs are
 returned as a :class:`sklearn.utils.Bunch<Bunch>`:

 >>> y_pred = dag2.fit_predict(X, y)
->>> y_pred.lr
-array([...
->>> y_pred.rf
-array([...
+>>> type(y_pred.lr)
+<class 'pandas.core.series.Series'>
+>>> type(y_pred.rf)
+<class 'pandas.core.series.Series'>

 Similarly, multiple inputs are also acceptable and inputs can be provided by
 specifying ``X`` and ``y`` as ``dict``-like objects.
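
Taken together, the quick start changes above migrate the examples from the old ``DAG.from_pipeline(...)`` / ``dag.draw()`` calls to the builder-based API. A minimal sketch of the new-style usage, assembled from the doctests in this diff (it assumes skdag 0.0.5 and scikit-learn are installed; it is not an excerpt from the repo):

    from skdag import DAGBuilder
    from sklearn.decomposition import PCA
    from sklearn.impute import SimpleImputer
    from sklearn.linear_model import LogisticRegression

    # Build a simple linear DAG with the builder API used throughout the updated docs.
    dag = DAGBuilder().from_pipeline(
        steps=[
            ("impute", SimpleImputer()),
            ("pca", PCA()),
            ("lr", LogisticRegression()),
        ]
    ).make_dag()

    # show() replaces the old draw() call; per the doctests it prints an ASCII
    # rendering of the graph when run in a terminal.
    dag.show()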

doc/user_guide.rst

Lines changed: 20 additions & 14 deletions
@@ -18,17 +18,17 @@ scikit-learn :class:`~sklearn.pipeline.Pipeline`. These DAGs may be created from

 .. code-block:: python

+>>> from skdag import DAGBuilder
 >>> from sklearn.decomposition import PCA
 >>> from sklearn.impute import SimpleImputer
 >>> from sklearn.linear_model import LogisticRegression
->>> dag = DAG.from_pipeline(
+>>> dag = DAGBuilder(infer_dataframe=True).from_pipeline(
 ...     steps=[
 ...         ("impute", SimpleImputer()),
 ...         ("pca", PCA()),
 ...         ("lr", LogisticRegression())
-...     ],
-...     infer_dataframe=True,
-... )
+...     ]
+... ).make_dag()

 You may view a diagram of the DAG with the :meth:`~skdag.dag.DAG.show` method. In a
 notbook environment this will display an image, whereas in a terminal it will generate
@@ -97,19 +97,20 @@ The DAG may now be used as an estimator in its own right:
 .. code-block:: python

 >>> from sklearn import datasets
->>> X, y = datasets.load_diabetes(return_X_y=True)
->>> dag.fit_predict(X, y)
-array([...
+>>> X, y = datasets.load_diabetes(return_X_y=True, as_frame=True)
+>>> y_hat = dag.fit_predict(X, y)
+>>> type(y_hat)
+<class 'pandas.core.series.Series'>

 In an extension to the scikit-learn estimator interface, DAGs also support multiple
 inputs and multiple outputs. Let's say we want to compare two different classifiers:

 .. code-block:: python

 >>> from sklearn.ensemble import RandomForestClassifier
->>> rf = DAG.from_pipeline(
+>>> rf = DAGBuilder().from_pipeline(
 ...     [("rf", RandomForestClassifier(random_state=0))]
-... )
+... ).make_dag()
 >>> dag2 = dag.join(rf, edges=[("blood", "rf"), ("vitals", "rf")])
 >>> dag2.show()
 o impute
@@ -126,10 +127,14 @@ returned as a :class:`sklearn.utils.Bunch<Bunch>`:
 .. code-block:: python

 >>> y_pred = dag2.fit_predict(X, y)
->>> y_pred.lr
-array([...
->>> y_pred.rf
-array([...
+>>> type(y_pred.lr)
+<class 'pandas.core.series.Series'>
+>>> type(y_pred.rf)
+<class 'numpy.ndarray'>
+
+Note that we have different types of output here because ``LogisticRegression`` natively
+supports dataframe input whereas ``RandomForestClassifier`` does not. We could fix this
+by specifying ``infer_dataframe=True`` when we createed our ``rf`` DAG extension.

 Similarly, multiple inputs are also acceptable and inputs can be provided by
 specifying ``X`` and ``y`` as ``dict``-like objects.
@@ -174,6 +179,7 @@ the next step(s).
 ...     .make_dag()
 ... )
 >>> stack.fit(X_train, y_train)
+DAG(...

 .. image:: _static/img/stack.png

@@ -210,7 +216,7 @@ as a dictionary of step name to column indices instead:
 ...     .add_step("pass", "passthrough")
 ...     .add_step("rf", RandomForestClassifier(), deps=["pass"])
 ...     .add_step("svr", SVC(), deps=["pass"])
-...     .add_step("meta", LinearRegression(), deps={"rf": 1, "svc": 1}])
+...     .add_step("meta", LinearRegression(), deps={"rf": 1, "svr": 1})
 ...     .make_dag()
 ... )
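
The note added to the user guide above explains why ``y_pred.rf`` comes back as a plain numpy array while ``y_pred.lr`` is a pandas Series. A hedged sketch of the suggested fix, continuing the guide's running example (``dag``, ``X`` and ``y`` as defined earlier in the guide; this snippet is not part of the committed docs):

    from skdag import DAGBuilder
    from sklearn.ensemble import RandomForestClassifier

    # Build the rf extension with dataframe inference enabled, as the note suggests.
    rf = DAGBuilder(infer_dataframe=True).from_pipeline(
        [("rf", RandomForestClassifier(random_state=0))]
    ).make_dag()

    dag2 = dag.join(rf, edges=[("blood", "rf"), ("vitals", "rf")])
    y_pred = dag2.fit_predict(X, y)
    # With infer_dataframe=True, y_pred.rf should now also be a pandas Series,
    # matching y_pred.lr (this mirrors how the quick start builds its "cal" DAG).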

setup.cfg

Lines changed: 3 additions & 0 deletions
@@ -5,9 +5,12 @@ description-file = README.rst
 test = pytest

 [tool:pytest]
+doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS
+testpaths = .
 addopts =
     -s
     --doctest-modules
+    --doctest-glob="*.rst"
     --cov=skdag
     --ignore setup.py
     --ignore doc/_build
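
These pytest settings make the documentation part of the test suite: ``--doctest-glob="*.rst"`` collects doctests from the ``.rst`` files, and the ``ELLIPSIS`` / ``NORMALIZE_WHITESPACE`` flags apply to every collected doctest. A small illustrative doctest (hypothetical, not from the repo) showing what ``ELLIPSIS`` permits under this configuration:

    def head_of_range(n):
        """
        >>> head_of_range(20)   # with ELLIPSIS enabled, "..." matches the remaining items
        [0, 1, 2, ...]
        """
        return list(range(n))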

skdag/_version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.0.4"
+__version__ = "0.0.5"

skdag/dag/_dag.py

Lines changed: 25 additions & 2 deletions
@@ -7,6 +7,7 @@
 from copy import deepcopy
 from inspect import signature
 from itertools import chain
+from typing import Iterable

 import networkx as nx
 import numpy as np
@@ -32,12 +33,34 @@
 __all__ = ["DAG", "DAGStep"]


+def _get_columns(X, dep, cols, is_root, axis=1):
+    if callable(cols):
+        # sklearn.compose.make_column_selector
+        cols = cols(X)
+
+    if not is_root:
+        # The DAG will prepend output columns with the step name, so add this in to any
+        # dep columns if missing. This helps keep user-provided deps readable.
+        if isinstance(cols, str):
+            cols = cols if cols.startswith(f"{dep}__") else f"{dep}__{cols}"
+        elif isinstance(cols, Iterable):
+            orig = cols
+            cols = []
+            for col in orig:
+                if isinstance(col, str):
+                    cols.append(col if col.startswith(f"{dep}__") else f"{dep}__{col}")
+                else:
+                    cols.append(col)
+
+    return _safe_indexing(X, cols, axis=axis)
+
+
 def _stack_inputs(dag, X, node):
     # For root nodes, the dependency is just the node name itself.
     deps = {node.name: None} if node.is_root else node.deps

     cols = [
-        X[dep][cols(X[dep])] if callable(cols) else _safe_indexing(X[dep], cols, axis=1)
+        _get_columns(X[dep], dep, cols, node.is_root, axis=1)
         for dep, cols in deps.items()
     ]

@@ -204,7 +227,7 @@ def _parallel_transform(dag, step, Xin, Xs, transform_fn, **fn_params):
     clsname = type(dag).__name__
     with _print_elapsed_time(clsname, dag._log_message(step)):
         if transformer is None or transformer == "passthrough":
-            Xt = X
+            Xt = X
         else:
             # Fit or load from cache the current transformer
             Xt = transform_fn(
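
The new ``_get_columns`` helper is what lets the reworked docs refer to dependency columns without the ``<step>__`` prefix that the DAG adds to each step's output. A standalone sketch (not repo code) that mirrors the prefixing behaviour on a toy dataframe:

    import pandas as pd
    from sklearn.utils import _safe_indexing  # same private helper the diff uses

    def get_columns_sketch(X, dep, cols, is_root, axis=1):
        # Mirror of the committed _get_columns: resolve callable selectors, then make
        # sure string column names carry the "<dep>__" prefix before indexing.
        if callable(cols):
            cols = cols(X)  # e.g. sklearn.compose.make_column_selector
        if not is_root:
            if isinstance(cols, str):
                cols = cols if cols.startswith(f"{dep}__") else f"{dep}__{cols}"
            elif hasattr(cols, "__iter__"):
                cols = [
                    c if not isinstance(c, str) or c.startswith(f"{dep}__") else f"{dep}__{c}"
                    for c in cols
                ]
        return _safe_indexing(X, cols, axis=axis)

    # Output of an upstream "impute" step; the DAG has already prefixed the columns,
    # but the caller can still ask for them by their short names.
    X = pd.DataFrame({"impute__age": [30, 40], "impute__bmi": [21.0, 25.5]})
    print(get_columns_sketch(X, "impute", ["age", "bmi"], is_root=False))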
