Merge branch 'main' of github.com:big-o/skdag into main

big-o · big-o · commit 1691bbb6851b · 2022-07-31T22:25:59.000+01:00
diff --git a/doc/_static/img/cover.svg b/doc/_static/img/cover.svg
@@ -0,0 +1,154 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Pages: 1 -->
+<svg width="576pt" height="236pt"
+ viewBox="0.00 0.00 576.00 235.72" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 231.72)">
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-231.72 572,-231.72 572,4 -4,4"/>
+<!-- impute -->
+<g id="node1" class="node">
+<title>impute</title>
+<g id="a_node1"><a xlink:title="SimpleImputer()">
+<polygon fill="none" stroke="black" points="77,-121 0,-121 0,-83 77,-83 77,-121"/>
+<text text-anchor="start" x="14.5" y="-103.8" font-family="SANS" font-size="14.00">impute</text>
+<text text-anchor="start" x="10.5" y="-91.2" font-family="MONOSPACE" font-size="6.00">SimpleImputer()</text>
+</a>
+</g>
+</g>
+<!-- vitals -->
+<g id="node2" class="node">
+<title>vitals</title>
+<g id="a_node2"><a xlink:title="&#39;passthrough&#39;">
+<polygon fill="none" stroke="black" points="226,-142 197.75,-173 141.25,-173 113,-142 141.25,-111 197.75,-111 226,-142"/>
+<text text-anchor="start" x="151.5" y="-143.8" font-family="SANS" font-size="14.00">vitals</text>
+<text text-anchor="start" x="145.5" y="-131.2" font-family="MONOSPACE" font-size="6.00">&quot;passthrough&quot;</text>
+</a>
+</g>
+</g>
+<!-- impute&#45;&gt;vitals -->
+<g id="edge1" class="edge">
+<title>impute&#45;&gt;vitals</title>
+<path fill="none" stroke="black" d="M77.16,-113.68C89.13,-117.39 102.6,-121.57 115.45,-125.55"/>
+<polygon fill="black" stroke="black" points="114.53,-128.93 125.11,-128.55 116.6,-122.24 114.53,-128.93"/>
+</g>
+<!-- blood -->
+<g id="node3" class="node">
+<title>blood</title>
+<g id="a_node3"><a xlink:title="PCA(n_components=2, random_state=0)">
+<polygon fill="none" stroke="black" points="215.5,-92.5 123.5,-92.5 123.5,-33.5 215.5,-33.5 215.5,-92.5"/>
+<text text-anchor="start" x="150" y="-74.8" font-family="SANS" font-size="14.00">blood</text>
+<text text-anchor="start" x="133.5" y="-62.2" font-family="MONOSPACE" font-size="6.00">PCA(</text>
+<text text-anchor="start" x="133.5" y="-55.2" font-family="MONOSPACE" font-size="6.00"> &#160;&#160;&#160;n_components=2,</text>
+<text text-anchor="start" x="133.5" y="-48.2" font-family="MONOSPACE" font-size="6.00"> &#160;&#160;&#160;random_state=0,</text>
+<text text-anchor="start" x="133.5" y="-41.2" font-family="MONOSPACE" font-size="6.00">)</text>
+</a>
+</g>
+</g>
+<!-- impute&#45;&gt;blood -->
+<g id="edge2" class="edge">
+<title>impute&#45;&gt;blood</title>
+<path fill="none" stroke="black" d="M77.16,-90.61C88.54,-87.17 101.28,-83.32 113.55,-79.61"/>
+<polygon fill="black" stroke="black" points="114.7,-82.92 123.26,-76.68 112.67,-76.22 114.7,-82.92"/>
+</g>
+<!-- rf -->
+<g id="node4" class="node">
+<title>rf</title>
+<g id="a_node4"><a xlink:title="RandomForestRegressor(max_depth=5, random_state=0)">
+<ellipse fill="none" stroke="black" cx="335" cy="-186" rx="73" ry="41.94"/>
+<text text-anchor="start" x="329.5" y="-197.8" font-family="SANS" font-size="14.00">rf</text>
+<text text-anchor="start" x="294" y="-185.2" font-family="MONOSPACE" font-size="6.00">RandomForestRegressor(</text>
+<text text-anchor="start" x="294" y="-178.2" font-family="MONOSPACE" font-size="6.00"> &#160;&#160;&#160;max_depth=5,</text>
+<text text-anchor="start" x="294" y="-171.2" font-family="MONOSPACE" font-size="6.00"> &#160;&#160;&#160;random_state=0,</text>
+<text text-anchor="start" x="294" y="-164.2" font-family="MONOSPACE" font-size="6.00">)</text>
+</a>
+</g>
+</g>
+<!-- vitals&#45;&gt;rf -->
+<g id="edge3" class="edge">
+<title>vitals&#45;&gt;rf</title>
+<path fill="none" stroke="black" d="M215.18,-154.02C228.67,-157.65 243.88,-161.75 258.79,-165.76"/>
+<polygon fill="black" stroke="black" points="257.96,-169.16 268.53,-168.38 259.78,-162.4 257.96,-169.16"/>
+</g>
+<!-- svm -->
+<g id="node5" class="node">
+<title>svm</title>
+<g id="a_node5"><a xlink:title="SVR(C=0.7)">
+<ellipse fill="none" stroke="black" cx="335" cy="-99" rx="41" ry="27"/>
+<text text-anchor="start" x="320" y="-100.8" font-family="SANS" font-size="14.00">svm</text>
+<text text-anchor="start" x="316" y="-88.2" font-family="MONOSPACE" font-size="6.00">SVR(C=0.7)</text>
+</a>
+</g>
+</g>
+<!-- vitals&#45;&gt;svm -->
+<g id="edge4" class="edge">
+<title>vitals&#45;&gt;svm</title>
+<path fill="none" stroke="black" d="M215.61,-130.14C237.74,-124.32 264.39,-117.31 286.8,-111.41"/>
+<polygon fill="black" stroke="black" points="287.98,-114.72 296.76,-108.79 286.2,-107.95 287.98,-114.72"/>
+</g>
+<!-- knn -->
+<g id="node6" class="node">
+<title>knn</title>
+<g id="a_node6"><a xlink:title="KNeighborsRegressor()">
+<ellipse fill="none" stroke="black" cx="335" cy="-27" rx="70" ry="27"/>
+<text text-anchor="start" x="322" y="-28.8" font-family="SANS" font-size="14.00">knn</text>
+<text text-anchor="start" x="296" y="-16.2" font-family="MONOSPACE" font-size="6.00">KNeighborsRegressor()</text>
+</a>
+</g>
+</g>
+<!-- vitals&#45;&gt;knn -->
+<g id="edge5" class="edge">
+<title>vitals&#45;&gt;knn</title>
+<path fill="none" stroke="black" d="M205.09,-118.72C212.21,-113.47 219.51,-107.75 226,-102 243.66,-86.36 243.14,-77.17 262,-63 267.13,-59.14 272.74,-55.51 278.49,-52.15"/>
+<polygon fill="black" stroke="black" points="280.55,-55.01 287.59,-47.09 277.15,-48.89 280.55,-55.01"/>
+</g>
+<!-- blood&#45;&gt;rf -->
+<g id="edge6" class="edge">
+<title>blood&#45;&gt;rf</title>
+<path fill="none" stroke="black" d="M213.93,-92.7C218.06,-95.77 222.15,-98.9 226,-102 242.91,-115.6 245.11,-121.37 262,-135 267.37,-139.33 273.07,-143.72 278.83,-148"/>
+<polygon fill="black" stroke="black" points="277.14,-151.1 287.27,-154.19 281.28,-145.46 277.14,-151.1"/>
+</g>
+<!-- blood&#45;&gt;svm -->
+<g id="edge7" class="edge">
+<title>blood&#45;&gt;svm</title>
+<path fill="none" stroke="black" d="M215.61,-72.93C237.5,-77.75 263.8,-83.54 286.05,-88.44"/>
+<polygon fill="black" stroke="black" points="285.44,-91.89 295.95,-90.62 286.94,-85.05 285.44,-91.89"/>
+</g>
+<!-- blood&#45;&gt;knn -->
+<g id="edge8" class="edge">
+<title>blood&#45;&gt;knn</title>
+<path fill="none" stroke="black" d="M215.61,-53.07C230.48,-49.79 247.39,-46.07 263.69,-42.48"/>
+<polygon fill="black" stroke="black" points="264.72,-45.84 273.73,-40.27 263.21,-39 264.72,-45.84"/>
+</g>
+<!-- meta -->
+<g id="node7" class="node">
+<title>meta</title>
+<g id="a_node7"><a xlink:title="LinearRegression()">
+<ellipse fill="none" stroke="black" cx="506" cy="-99" rx="62" ry="27"/>
+<text text-anchor="start" x="488" y="-100.8" font-family="SANS" font-size="14.00">meta</text>
+<text text-anchor="start" x="472" y="-88.2" font-family="MONOSPACE" font-size="6.00">LinearRegression()</text>
+</a>
+</g>
+</g>
+<!-- rf&#45;&gt;meta -->
+<g id="edge9" class="edge">
+<title>rf&#45;&gt;meta</title>
+<path fill="none" stroke="black" d="M389.94,-158.23C411.23,-147.27 435.51,-134.77 456.26,-124.09"/>
+<polygon fill="black" stroke="black" points="457.93,-127.17 465.22,-119.48 454.73,-120.94 457.93,-127.17"/>
+</g>
+<!-- svm&#45;&gt;meta -->
+<g id="edge10" class="edge">
+<title>svm&#45;&gt;meta</title>
+<path fill="none" stroke="black" d="M376.05,-99C393.26,-99 413.9,-99 433.57,-99"/>
+<polygon fill="black" stroke="black" points="433.75,-102.5 443.75,-99 433.75,-95.5 433.75,-102.5"/>
+</g>
+<!-- knn&#45;&gt;meta -->
+<g id="edge11" class="edge">
+<title>knn&#45;&gt;meta</title>
+<path fill="none" stroke="black" d="M382.63,-46.86C404.08,-56.01 429.65,-66.9 451.85,-76.36"/>
+<polygon fill="black" stroke="black" points="450.57,-79.61 461.14,-80.31 453.31,-73.17 450.57,-79.61"/>
+</g>
+</g>
+</svg>
diff --git a/doc/index.rst b/doc/index.rst
@@ -5,7 +5,70 @@ scikit-dag (``skdag``) is an open-sourced, MIT-licenced library that provides ad
 workflow management to any machine learning operations that follow
 :mod:`sklearn` conventions. It does this by introducing Directed Acyclic
 Graphs (:class:`skdag.dag.DAG`) as a drop-in replacement for traditional scikit-learn
-:mod:`sklearn.pipeline.Pipeline`.
+:mod:`sklearn.pipeline.Pipeline`. This gives you a simple interface for a range of use
+cases including complex pre-processing, model stacking and benchmarking.
+
+.. code-block:: python
+
+   from skdag import DAGBuilder
+
+   dag = (
+      DAGBuilder()
+      .add_step("impute", SimpleImputer())
+      .add_step("vitals", "passthrough", deps={"impute": slice(0, 4)})
+      .add_step(
+         "blood",
+         PCA(n_components=2, random_state=0),
+         deps={"impute": slice(4, 10)}
+      )
+      .add_step(
+         "rf",
+         RandomForestRegressor(max_depth=5, random_state=0),
+         deps=["blood", "vitals"]
+      )
+      .add_step("svm", SVR(C=0.7), deps=["blood", "vitals"])
+      .add_step(
+         "knn",
+         KNeighborsRegressor(n_neighbors=5),
+         deps=["blood", "vitals"]
+      )
+      .add_step("meta", LinearRegression(), deps=["rf", "svm", "knn"])
+      .make_dag(n_jobs=2, verbose=True)
+   )
+
+   dag.show(detailed=True)
+
+.. image:: _static/img/cover.svg
+
+The above DAG imputes missing values, runs PCA on the columns relating to blood test
+results and leaves the other columns as they are. Then they get passed to three
+different regressors before being passed on to a final meta-estimator. Because DAGs
+(unlike pipelines) allow predictors in the middle of a workflow, you can use them to
+implement model stacking. We also chose to run the DAG steps in parallel wherever
+possible.
+
+After building our DAG, we can treat it as any other estimator:
+
+.. code-block:: python
+
+   from sklearn import datasets
+
+   X, y = datasets.load_diabetes(return_X_y=True, as_frame=True)
+   X_train, X_test, y_train, y_test = train_test_split(
+      X, y, test_size=0.2, random_state=0
+   )
+
+   dag.fit(X_train, y_train)
+   dag.predict(X_test)
+
+Just like a pipeline, you can optimise it with a grid search, pickle it, etc.
+
+Note that this package does not deal with things like delayed dependencies and
+distributed architectures - consider an `established <https://airflow.apache.org/>`_
+`solution <https://dagster.io/>`_ for such use cases. ``skdag`` is just for building and
+executing local ensembles from estimators.
+
+:ref:`Read on<quickstart>` to learn more about ``skdag``...
 
 .. toctree::
    :maxdepth: 2
diff --git a/doc/quick_start.rst b/doc/quick_start.rst
@@ -1,6 +1,8 @@
-#####################################
+.. _quickstart:
+
+######################
 Quick Start with skdag
-#####################################
+######################
 
 The following tutorial shows you how to write some simple directed acyclic graphs (DAGs)
 with ``skdag``.
@@ -21,8 +23,8 @@ to do this in Ubuntu:
 
     sudo apt install graphviz graphviz-dev
 
-Creating your own scikit-learn contribution package
-===================================================
+Creating a DAG
+==============
 
 The simplest DAGs are just a chain of singular dependencies. These DAGs may be
 created from the :meth:`skdag.dag.DAG.from_pipeline` method in the same way as a