
Commit a3a17d8

v0.0.5
1 parent 1a329c2 commit a3a17d8

File tree

13 files changed: +190 -89 lines


doc/_static/img/cover.png    -10.1 KB
doc/_static/img/dag2.png     -55.4 KB
doc/_static/img/dag2a.png    50.1 KB
doc/_static/img/dag3.png     -80.2 KB
doc/_static/img/dag3a.png    62.3 KB

doc/quick_start.rst

Lines changed: 33 additions & 31 deletions
@@ -26,23 +26,26 @@ The simplest DAGs are just a chain of singular dependencies. These DAGs may be
 created from the :meth:`skdag.dag.DAG.from_pipeline` method in the same way as a
 DAG:

->>> from sklearn.decomposition import PCA
->>> from sklearn.impute import SimpleImputer
->>> from sklearn.linear_model import LogisticRegression
->>> dag = DAG.from_pipeline(
-...     steps=[
-...         ("impute", SimpleImputer()),
-...         ("pca", PCA()),
-...         ("lr", LogisticRegression())
-...     ]
-... )
->>> dag.draw()
-o impute
-|
-o pca
-|
-o lr
-<BLANKLINE>
+.. code-block:: python
+
+>>> from skdag import DAGBuilder
+>>> from sklearn.decomposition import PCA
+>>> from sklearn.impute import SimpleImputer
+>>> from sklearn.linear_model import LogisticRegression
+>>> dag = DAGBuilder().from_pipeline(
+...     steps=[
+...         ("impute", SimpleImputer()),
+...         ("pca", PCA()),
+...         ("lr", LogisticRegression())
+...     ]
+... ).make_dag()
+>>> dag.show()
+o impute
+|
+o pca
+|
+o lr
+<BLANKLINE>

 .. image:: _static/img/dag1.png

@@ -52,7 +55,6 @@ estimator:

 .. code-block:: python

->>> from skdag import DAGBuilder
 >>> dag = (
 ...     DAGBuilder(infer_dataframe=True)
 ...     .add_step("impute", SimpleImputer())
@@ -61,15 +63,15 @@ estimator:
 ...     .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
 ...     .make_dag()
 ... )
->>> dag.draw()
+>>> dag.show()
 o impute
 |\
 o o blood,vitals
 |/
 o lr
 <BLANKLINE>

-.. image:: _static/img/dag2.png
+.. image:: _static/img/dag2a.png

 In the above examples we pass the first four columns directly to a regressor, but
 the remaining columns have dimensionality reduction applied first before being
@@ -82,36 +84,36 @@ on how to control this behaviour, see the `User Guide <user_guide.html>`_.
 The DAG may now be used as an estimator in its own right:

 >>> from sklearn import datasets
->>> X, y = datasets.load_diabetes(return_X_y=True)
->>> dag.fit_predict(X, y)
-array([...
+>>> X, y = datasets.load_diabetes(return_X_y=True, as_frame=True)
+>>> type(dag.fit_predict(X, y))
+<class 'pandas.core.series.Series'>

 In an extension to the scikit-learn estimator interface, DAGs also support multiple
 inputs and multiple outputs. Let's say we want to compare two different classifiers:

 >>> from sklearn.ensemble import RandomForestClassifier
->>> cal = DAG.from_pipeline(
+>>> cal = DAGBuilder(infer_dataframe=True).from_pipeline(
 ...     [("rf", RandomForestClassifier(random_state=0))]
-... )
+... ).make_dag()
 >>> dag2 = dag.join(cal, edges=[("blood", "rf"), ("vitals", "rf")])
->>> dag2.draw()
+>>> dag2.show()
 o impute
 |\
 o o blood,vitals
 |x|
 o o lr,rf
 <BLANKLINE>

-.. image:: _static/img/dag3.png
+.. image:: _static/img/dag3a.png

 Now our DAG will return two outputs: one from each classifier. Multiple outputs are
 returned as a :class:`sklearn.utils.Bunch<Bunch>`:

 >>> y_pred = dag2.fit_predict(X, y)
->>> y_pred.lr
-array([...
->>> y_pred.rf
-array([...
+>>> type(y_pred.lr)
+<class 'pandas.core.series.Series'>
+>>> type(y_pred.rf)
+<class 'pandas.core.series.Series'>

 Similarly, multiple inputs are also acceptable and inputs can be provided by
 specifying ``X`` and ``y`` as ``dict``-like objects.
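
Taken together, the quick start changes above migrate the examples from the old ``DAG.from_pipeline(...)`` / ``dag.draw()`` calls to the builder-based API. A minimal sketch of the new-style usage, assembled from the doctests in this diff (it assumes skdag 0.0.5 and scikit-learn are installed; it is not an excerpt from the repo):

    from skdag import DAGBuilder
    from sklearn.decomposition import PCA
    from sklearn.impute import SimpleImputer
    from sklearn.linear_model import LogisticRegression

    # Build a simple linear DAG with the builder API used throughout the updated docs.
    dag = DAGBuilder().from_pipeline(
        steps=[
            ("impute", SimpleImputer()),
            ("pca", PCA()),
            ("lr", LogisticRegression()),
        ]
    ).make_dag()

    # show() replaces the old draw() call; per the doctests it prints an ASCII
    # rendering of the graph when run in a terminal.
    dag.show()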

doc/user_guide.rst

Lines changed: 20 additions & 14 deletions
@@ -18,17 +18,17 @@ scikit-learn :class:`~sklearn.pipeline.Pipeline`. These DAGs may be created from

 .. code-block:: python

+>>> from skdag import DAGBuilder
 >>> from sklearn.decomposition import PCA
 >>> from sklearn.impute import SimpleImputer
 >>> from sklearn.linear_model import LogisticRegression
->>> dag = DAG.from_pipeline(
+>>> dag = DAGBuilder(infer_dataframe=True).from_pipeline(
 ...     steps=[
 ...         ("impute", SimpleImputer()),
 ...         ("pca", PCA()),
 ...         ("lr", LogisticRegression())
-...     ],
-...     infer_dataframe=True,
-... )
+...     ]
+... ).make_dag()

 You may view a diagram of the DAG with the :meth:`~skdag.dag.DAG.show` method. In a
 notbook environment this will display an image, whereas in a terminal it will generate
@@ -97,19 +97,20 @@ The DAG may now be used as an estimator in its own right:
 .. code-block:: python

 >>> from sklearn import datasets
->>> X, y = datasets.load_diabetes(return_X_y=True)
->>> dag.fit_predict(X, y)
-array([...
+>>> X, y = datasets.load_diabetes(return_X_y=True, as_frame=True)
+>>> y_hat = dag.fit_predict(X, y)
+>>> type(y_hat)
+<class 'pandas.core.series.Series'>

 In an extension to the scikit-learn estimator interface, DAGs also support multiple
 inputs and multiple outputs. Let's say we want to compare two different classifiers:

 .. code-block:: python

 >>> from sklearn.ensemble import RandomForestClassifier
->>> rf = DAG.from_pipeline(
+>>> rf = DAGBuilder().from_pipeline(
 ...     [("rf", RandomForestClassifier(random_state=0))]
-... )
+... ).make_dag()
 >>> dag2 = dag.join(rf, edges=[("blood", "rf"), ("vitals", "rf")])
 >>> dag2.show()
 o impute
@@ -126,10 +127,14 @@ returned as a :class:`sklearn.utils.Bunch<Bunch>`:
 .. code-block:: python

 >>> y_pred = dag2.fit_predict(X, y)
->>> y_pred.lr
-array([...
->>> y_pred.rf
-array([...
+>>> type(y_pred.lr)
+<class 'pandas.core.series.Series'>
+>>> type(y_pred.rf)
+<class 'numpy.ndarray'>
+
+Note that we have different types of output here because ``LogisticRegression`` natively
+supports dataframe input whereas ``RandomForestClassifier`` does not. We could fix this
+by specifying ``infer_dataframe=True`` when we createed our ``rf`` DAG extension.

 Similarly, multiple inputs are also acceptable and inputs can be provided by
 specifying ``X`` and ``y`` as ``dict``-like objects.
@@ -174,6 +179,7 @@ the next step(s).
 ...     .make_dag()
 ... )
 >>> stack.fit(X_train, y_train)
+DAG(...

 .. image:: _static/img/stack.png

@@ -210,7 +216,7 @@ as a dictionary of step name to column indices instead:
 ...     .add_step("pass", "passthrough")
 ...     .add_step("rf", RandomForestClassifier(), deps=["pass"])
 ...     .add_step("svr", SVC(), deps=["pass"])
-...     .add_step("meta", LinearRegression(), deps={"rf": 1, "svc": 1}])
+...     .add_step("meta", LinearRegression(), deps={"rf": 1, "svr": 1})
 ...     .make_dag()
 ... )
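
The note added to the user guide above explains why ``y_pred.rf`` comes back as a plain numpy array while ``y_pred.lr`` is a pandas Series. A hedged sketch of the suggested fix, continuing the guide's running example (``dag``, ``X`` and ``y`` as defined earlier in the guide; this snippet is not part of the committed docs):

    from skdag import DAGBuilder
    from sklearn.ensemble import RandomForestClassifier

    # Build the rf extension with dataframe inference enabled, as the note suggests.
    rf = DAGBuilder(infer_dataframe=True).from_pipeline(
        [("rf", RandomForestClassifier(random_state=0))]
    ).make_dag()

    dag2 = dag.join(rf, edges=[("blood", "rf"), ("vitals", "rf")])
    y_pred = dag2.fit_predict(X, y)
    # With infer_dataframe=True, y_pred.rf should now also be a pandas Series,
    # matching y_pred.lr (this mirrors how the quick start builds its "cal" DAG).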

setup.cfg

Lines changed: 3 additions & 0 deletions
@@ -5,9 +5,12 @@ description-file = README.rst
 test = pytest

 [tool:pytest]
+doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS
+testpaths = .
 addopts =
     -s
     --doctest-modules
+    --doctest-glob="*.rst"
     --cov=skdag
     --ignore setup.py
     --ignore doc/_build
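
These pytest settings make the documentation part of the test suite: ``--doctest-glob="*.rst"`` collects doctests from the ``.rst`` files, and the ``ELLIPSIS`` / ``NORMALIZE_WHITESPACE`` flags apply to every collected doctest. A small illustrative doctest (hypothetical, not from the repo) showing what ``ELLIPSIS`` permits under this configuration:

    def head_of_range(n):
        """
        >>> head_of_range(20)   # with ELLIPSIS enabled, "..." matches the remaining items
        [0, 1, 2, ...]
        """
        return list(range(n))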

skdag/_version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.0.4"
+__version__ = "0.0.5"

skdag/dag/_dag.py

Lines changed: 25 additions & 2 deletions
@@ -7,6 +7,7 @@
 from copy import deepcopy
 from inspect import signature
 from itertools import chain
+from typing import Iterable

 import networkx as nx
 import numpy as np
@@ -32,12 +33,34 @@
 __all__ = ["DAG", "DAGStep"]


+def _get_columns(X, dep, cols, is_root, axis=1):
+    if callable(cols):
+        # sklearn.compose.make_column_selector
+        cols = cols(X)
+
+    if not is_root:
+        # The DAG will prepend output columns with the step name, so add this in to any
+        # dep columns if missing. This helps keep user-provided deps readable.
+        if isinstance(cols, str):
+            cols = cols if cols.startswith(f"{dep}__") else f"{dep}__{cols}"
+        elif isinstance(cols, Iterable):
+            orig = cols
+            cols = []
+            for col in orig:
+                if isinstance(col, str):
+                    cols.append(col if col.startswith(f"{dep}__") else f"{dep}__{col}")
+                else:
+                    cols.append(col)
+
+    return _safe_indexing(X, cols, axis=axis)
+
+
 def _stack_inputs(dag, X, node):
     # For root nodes, the dependency is just the node name itself.
     deps = {node.name: None} if node.is_root else node.deps

     cols = [
-        X[dep][cols(X[dep])] if callable(cols) else _safe_indexing(X[dep], cols, axis=1)
+        _get_columns(X[dep], dep, cols, node.is_root, axis=1)
         for dep, cols in deps.items()
     ]

@@ -204,7 +227,7 @@ def _parallel_transform(dag, step, Xin, Xs, transform_fn, **fn_params):
     clsname = type(dag).__name__
     with _print_elapsed_time(clsname, dag._log_message(step)):
         if transformer is None or transformer == "passthrough":
-            Xt = X
+            Xt = X
         else:
             # Fit or load from cache the current transformer
             Xt = transform_fn(
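
The new ``_get_columns`` helper is what lets the reworked docs refer to dependency columns without the ``<step>__`` prefix that the DAG adds to each step's output. A standalone sketch (not repo code) that mirrors the prefixing behaviour on a toy dataframe:

    import pandas as pd
    from sklearn.utils import _safe_indexing  # same private helper the diff uses

    def get_columns_sketch(X, dep, cols, is_root, axis=1):
        # Mirror of the committed _get_columns: resolve callable selectors, then make
        # sure string column names carry the "<dep>__" prefix before indexing.
        if callable(cols):
            cols = cols(X)  # e.g. sklearn.compose.make_column_selector
        if not is_root:
            if isinstance(cols, str):
                cols = cols if cols.startswith(f"{dep}__") else f"{dep}__{cols}"
            elif hasattr(cols, "__iter__"):
                cols = [
                    c if not isinstance(c, str) or c.startswith(f"{dep}__") else f"{dep}__{c}"
                    for c in cols
                ]
        return _safe_indexing(X, cols, axis=axis)

    # Output of an upstream "impute" step; the DAG has already prefixed the columns,
    # but the caller can still ask for them by their short names.
    X = pd.DataFrame({"impute__age": [30, 40], "impute__bmi": [21.0, 25.5]})
    print(get_columns_sketch(X, "impute", ["age", "bmi"], is_root=False))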
