Skip to content

Commit 4f849b4

Browse files
committed
v0.0.3 - pandas
1 parent 9c306e0 commit 4f849b4

File tree

10 files changed

+428
-171
lines changed

10 files changed

+428
-171
lines changed

appveyor.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ install: |
1919
for f in $(find . -maxdepth 1 -name 'requirements*.txt'); do
2020
pip install -r ${f}
2121
done
22-
pip install pandas # Needed for some estimator checks.
2322
pip install .
2423
2524
test_script:

doc/index.rst

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,24 +13,28 @@ cases including complex pre-processing, model stacking and benchmarking.
1313
from skdag import DAGBuilder
1414
1515
dag = (
16-
DAGBuilder()
16+
DAGBuilder(infer_dataframe=True)
1717
.add_step("impute", SimpleImputer())
18-
.add_step("vitals", "passthrough", deps={"impute": slice(0, 4)})
18+
.add_step(
19+
"vitals",
20+
"passthrough",
21+
deps={"impute": ["age", "sex", "bmi", "bp"]},
22+
)
1923
.add_step(
2024
"blood",
2125
PCA(n_components=2, random_state=0),
22-
deps={"impute": slice(4, 10)}
26+
deps={"impute": ["s1", "s2", "s3", "s4", "s5", "s6"]},
2327
)
2428
.add_step(
2529
"rf",
2630
RandomForestRegressor(max_depth=5, random_state=0),
27-
deps=["blood", "vitals"]
31+
deps=["blood", "vitals"],
2832
)
2933
.add_step("svm", SVR(C=0.7), deps=["blood", "vitals"])
3034
.add_step(
3135
"knn",
3236
KNeighborsRegressor(n_neighbors=5),
33-
deps=["blood", "vitals"]
37+
deps=["blood", "vitals"],
3438
)
3539
.add_step("meta", LinearRegression(), deps=["rf", "svm", "knn"])
3640
.make_dag(n_jobs=2, verbose=True)

doc/quick_start.rst

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,31 +50,34 @@ For more complex DAGs, it is recommended to use a :class:`skdag.dag.DAGBuilder`,
5050
which allows you to define the graph by specifying the dependencies of each new
5151
estimator:
5252

53-
>>> from skdag import DAGBuilder
54-
>>> dag = (
55-
... DAGBuilder()
56-
... .add_step("impute", SimpleImputer())
57-
... .add_step("vitals", "passthrough", deps={"impute": slice(0, 4)})
58-
... .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": slice(4, 10)})
59-
... .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
60-
... .make_dag()
61-
... )
62-
>>> dag.draw()
63-
o impute
64-
|\
65-
o o blood,vitals
66-
|/
67-
o lr
68-
<BLANKLINE>
53+
.. code-block:: python
54+
55+
>>> from skdag import DAGBuilder
56+
>>> dag = (
57+
... DAGBuilder(infer_dataframe=True)
58+
... .add_step("impute", SimpleImputer())
59+
... .add_step("vitals", "passthrough", deps={"impute": ["age", "sex", "bmi", "bp"]})
60+
... .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": slice(4, 10)})
61+
... .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
62+
... .make_dag()
63+
... )
64+
>>> dag.draw()
65+
o impute
66+
|\
67+
o o blood,vitals
68+
|/
69+
o lr
70+
<BLANKLINE>
6971
7072
.. image:: _static/img/dag2.png
7173

7274
In the above examples we pass the first four columns directly to a regressor, but
7375
the remaining columns have dimensionality reduction applied first before being
74-
passed to the same regressor as extra input columns. Note that we can define our graph
75-
edges in two different ways: as a dict (if we need to select only certain columns from
76-
the source node) or as a simple list (if we want to simply grab all columns from all
77-
input nodes).
76+
passed to the same regressor as extra input columns.
77+
78+
In this DAG, as well as using the ``deps`` option to control which estimators feed in to
79+
other estimators, but which columns are used (and ignored) by each step. For more detail
80+
on how to control this behaviour, see the `User Guide <user_guide.html>`_.
7881

7982
The DAG may now be used as an estimator in its own right:
8083

doc/user_guide.rst

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ scikit-learn :class:`~sklearn.pipeline.Pipeline`. These DAGs may be created from
2626
... ("impute", SimpleImputer()),
2727
... ("pca", PCA()),
2828
... ("lr", LogisticRegression())
29-
... ]
29+
... ],
30+
... infer_dataframe=True,
3031
... )
3132
3233
You may view a diagram of the DAG with the :meth:`~skdag.dag.DAG.show` method. In a
@@ -44,18 +45,25 @@ ASCII text:
4445
4546
.. image:: _static/img/dag1.png
4647

48+
Note that we also provided an extra option, ``infer_dataframe``. This is entirely
49+
optional, but if set the DAG will ensure that dataframe inputs have column and index
50+
information preserved (or inferred), and the output of the pipeline will also be a
51+
dataframe. This is useful if you wish to filter down the inputs for one particular step
52+
to only include certain columns; something we shall see in action later.
53+
4754
For more complex DAGs, it is recommended to use a :class:`skdag.dag.DAGBuilder`,
4855
which allows you to define the graph by specifying the dependencies of each new
4956
estimator:
5057

5158
.. code-block:: python
5259
5360
>>> from skdag import DAGBuilder
61+
>>> from sklearn.compose import make_column_selector
5462
>>> dag = (
55-
... DAGBuilder()
63+
... DAGBuilder(infer_dataframe=True)
5664
... .add_step("impute", SimpleImputer())
57-
... .add_step("vitals", "passthrough", deps={"impute": slice(0, 4)})
58-
... .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": slice(4, 10)})
65+
... .add_step("vitals", "passthrough", deps={"impute": ["age", "sex", "bmi", "bp"]})
66+
... .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": make_column_selector("s[0-9]+")})
5967
... .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
6068
... .make_dag()
6169
... )
@@ -73,7 +81,16 @@ the remaining columns have dimensionality reduction applied first before being
7381
passed to the same regressor. Note that we can define our graph edges in two
7482
different ways: as a dict (if we need to select only certain columns from the source
7583
node) or as a simple list (if we want to simply grab all columns from all input
76-
nodes).
84+
nodes). Columns may be specified as any kind of iterable (e.g. a list), as a slice, or as a
85+
column selector callable such as one created by :class:`sklearn.compose.make_column_selector`.
86+
87+
If you wish to specify string column names for dependencies, ensure you provide the
88+
``infer_dataframe=True`` option when you create a dag. This will ensure that all
89+
estimator outputs are coerced into dataframes. Where possible column names will be
90+
inferred, otherwise the column names will just be the name of the estimator step with an
91+
appended index number. If you do not specify ``infer_dataframe=True``, the dag will
92+
leave the outputs unmodified, which in most cases will mean numpy arrays that only
93+
support numeric column indices.
7794

7895
The DAG may now be used as an estimator in its own right:
7996

@@ -189,7 +206,7 @@ as a dictionary of step name to column indices instead:
189206
>>> from sklearn.ensemble import RandomForestClassifier
190207
>>> from sklearn.svm import SVC
191208
>>> clf_stack = (
192-
... DAGBuilder()
209+
... DAGBuilder(infer_dataframe=True)
193210
... .add_step("pass", "passthrough")
194211
... .add_step("rf", RandomForestClassifier(), deps=["pass"])
195212
... .add_step("svr", SVC(), deps=["pass"])

requirements_test.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1+
pandas
12
pytest
23
pytest-cov

skdag/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.2"
1+
__version__ = "0.0.3"

skdag/dag/_builder.py

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,15 @@ class DAGBuilder:
1616
that reference each step by name. Note that steps must be defined before they are
1717
used as dependencies.
1818
19+
Parameters
20+
----------
21+
22+
infer_dataframe : bool, default = False
23+
If True, assume ``dataframe_columns="infer"`` every time :meth:`.add_step` is
24+
called, if ``dataframe_columns`` is set to ``None``. This effectively makes the
25+
resulting DAG always try to coerce output into pandas DataFrames wherever
26+
possible.
27+
1928
See Also
2029
--------
2130
:class:`skdag.DAG` : The estimator DAG created by this utility.
@@ -43,10 +52,66 @@ class DAGBuilder:
4352
o lr
4453
"""
4554

46-
def __init__(self):
55+
def __init__(self, infer_dataframe=False):
4756
self.graph = nx.DiGraph()
57+
self.infer_dataframe = infer_dataframe
58+
59+
def from_pipeline(self, steps, **kwargs):
60+
"""
61+
Construct a DAG from a simple linear sequence of steps. The resulting DAG will
62+
be equivalent to a :class:`~sklearn.pipeline.Pipeline`.
63+
64+
Parameters
65+
----------
66+
67+
steps : sequence of (str, estimator)
68+
An ordered sequence of pipeline steps. A step is simply a pair of
69+
``(name, estimator)``, just like a scikit-learn Pipeline.
70+
71+
infer_dataframe : bool, default = False
72+
If True, assume ``dataframe_columns="infer"`` every time :meth:`.add_step`
73+
is called, if ``dataframe_columns`` is set to ``None``. This effectively
74+
makes the resulting DAG always try to coerce output into pandas DataFrames
75+
wherever possible.
76+
77+
kwargs : kwargs
78+
Any other hyperparameters that are accepted by :class:`~skdag.dag.DAG`'s
79+
contructor.
80+
"""
81+
if hasattr(steps, "steps"):
82+
pipe = steps
83+
steps = pipe.steps
84+
if hasattr(pipe, "get_params"):
85+
kwargs = {
86+
**{
87+
k: v
88+
for k, v in pipe.get_params().items()
89+
if k in ("memory", "verbose")
90+
},
91+
**kwargs,
92+
}
93+
94+
dfcols = "infer" if self.infer_dataframe else None
95+
96+
for i in range(len(steps)):
97+
name, estimator = steps[i]
98+
self._validate_name(name)
99+
deps = {}
100+
if i > 0:
101+
dep = steps[i - 1][0]
102+
deps[dep] = None
103+
self._validate_deps(deps)
104+
105+
step = DAGStep(name, estimator, deps, dataframe_columns=dfcols)
106+
self.graph.add_node(name, step=step)
107+
if deps:
108+
self.graph.add_edge(dep, name)
48109

49-
def add_step(self, name, est, deps=None):
110+
self._validate_graph()
111+
112+
return self
113+
114+
def add_step(self, name, est, deps=None, dataframe_columns=None):
50115
self._validate_name(name)
51116
if isinstance(deps, Sequence):
52117
deps = {dep: None for dep in deps}
@@ -56,7 +121,12 @@ def add_step(self, name, est, deps=None):
56121
else:
57122
deps = {}
58123

59-
step = DAGStep(name, est, deps=deps)
124+
if dataframe_columns is None and self.infer_dataframe:
125+
dfcols = "infer"
126+
else:
127+
dfcols = dataframe_columns
128+
129+
step = DAGStep(name, est, deps=deps, dataframe_columns=dfcols)
60130
self.graph.add_node(name, step=step)
61131

62132
for dep in deps:

0 commit comments

Comments
 (0)