model: scikit: examples: Testable LR

pdxjohnny · pdxjohnny · commit da2d71420ec7 · 2020-03-02T15:15:12.000-08:00
Signed-off-by: John Andersen &lt;johnandersenpdx@gmail.com&gt;
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Use randomly generated data for scikit tests
 - Change Core to Official to clarify who maintains each plugin
 - Name of output of unsupervised model from "Prediction" to "cluster"
+- Test scikit LR documentation examples in CI
 
 ## [0.3.4] - 2020-02-28
 ### Added
diff --git a/docs/plugins/dffml_model.rst b/docs/plugins/dffml_model.rst
@@ -684,7 +684,7 @@ Predicting with trained model:
 
 **Usage Example:**
 
-Example below uses LinearRegression Model on a small dataset.
+The example below uses the LinearRegression Model from the command line.
 
 Let us take a simple example:
 
@@ -704,43 +704,34 @@ Let us take a simple example:
 |          5           |     11     |      1.2     |   60   |
 +----------------------+------------+--------------+--------+
 
-.. code-block:: console
+First, we create the training and test data files:
+
+.. literalinclude:: /../model/scikit/examples/lr/train_data.sh
+
+.. literalinclude:: /../model/scikit/examples/lr/test_data.sh
+
+Train the model
+
+.. literalinclude:: /../model/scikit/examples/lr/train.sh
+
+Assess accuracy
+
+.. literalinclude:: /../model/scikit/examples/lr/accuracy.sh
+
+Output:
+
+.. code-block:: console
 
-    $ cat > train.csv << EOF
-    Years,Expertise,Trust,Salary
-    0,1,0.2,10
-    1,3,0.4,20
-    2,5,0.6,30
-    3,7,0.8,40
-    EOF
-    $ cat > test.csv << EOF
-    Years,Expertise,Trust,Salary
-    4,9,1.0,50
-    5,11,1.2,60
-    EOF
-    $ dffml train \
-        -model scikitlr \
-        -model-features Years:int:1 Expertise:int:1 Trust:float:1 \
-        -model-predict Salary:float:1 \
-        -sources f=csv \
-        -source-filename train.csv \
-        -log debug
-    $ dffml accuracy \
-        -model scikitlr \
-        -model-features Years:int:1 Expertise:int:1 Trust:float:1 \
-        -model-predict Salary:float:1 \
-        -sources f=csv \
-        -source-filename test.csv \
-        -log debug
     1.0
-    $ echo -e 'Years,Expertise,Trust\n6,13,1.4\n' | \
-      dffml predict all \
-        -model scikitlr \
-        -model-features Years:int:1 Expertise:int:1 Trust:float:1 \
-        -model-predict Salary:float:1 \
-        -sources f=csv \
-        -source-filename /dev/stdin \
-        -log debug
+
+Make a prediction
+
+.. literalinclude:: /../model/scikit/examples/lr/predict.sh
+
+Output:
+
+.. code-block:: json
+
     [
         {
             "extra": {},
@@ -749,46 +740,20 @@ Let us take a simple example:
                 "Trust": 1.4,
                 "Years": 6
             },
-            "last_updated": "2019-09-18T19:04:18Z",
+            "key": "0",
+            "last_updated": "2020-02-07T14:17:08Z",
             "prediction": {
-                "confidence": 1.0,
-                "value": 70.00000000000001
-            },
-            "key": 0
+                "Salary": {
+                    "confidence": 1.0,
+                    "value": 70.13972055888223
+                }
+            }
         }
     ]
 
 Example usage of Linear Regression Model using python API:
 
-.. code-block:: python
-
-    from dffml import CSVSource, Features, DefFeature
-    from dffml.noasync import train, accuracy, predict
-    from dffml_model_scikit import LinearRegressionModel
-
-    model = LinearRegressionModel(
-        features=Features(
-            DefFeature("Years", int, 1),
-            DefFeature("Expertise", int, 1),
-            DefFeature("Trust", float, 1),
-        ),
-        predict=DefFeature("Salary", int, 1),
-    )
-
-    # Train the model
-    train(model, "train.csv")
-
-    # Assess accuracy (alternate way of specifying data source)
-    print("Accuracy:", accuracy(model, CSVSource(filename="test.csv")))
-
-    # Make prediction
-    for i, features, prediction in predict(
-        model,
-        {"Years": 6, "Expertise": 13, "Trust": 0.7},
-        {"Years": 7, "Expertise": 15, "Trust": 0.8},
-    ):
-        features["Salary"] = prediction["Salary"]["value"]
-        print(features)
+.. literalinclude:: /../model/scikit/examples/lr/lr.py
 
 Example below uses KMeans Clustering Model on a small randomly generated dataset.
 
diff --git a/model/scikit/dffml_model_scikit/__init__.py b/model/scikit/dffml_model_scikit/__init__.py
@@ -122,7 +122,7 @@
 
 **Usage Example:**
 
-Example below uses LinearRegression Model on a small dataset.
+The example below uses the LinearRegression Model from the command line.
 
 Let us take a simple example:
 
@@ -142,91 +142,57 @@
 |          5           |     11     |      1.2     |   60   |
 +----------------------+------------+--------------+--------+
 
-.. code-block:: console
+First, we create the training and test data files:
+
+.. literalinclude:: /../model/scikit/examples/lr/train_data.sh
+
+.. literalinclude:: /../model/scikit/examples/lr/test_data.sh
+
+Train the model
+
+.. literalinclude:: /../model/scikit/examples/lr/train.sh
+
+Assess accuracy
+
+.. literalinclude:: /../model/scikit/examples/lr/accuracy.sh
+
+Output:
+
+.. code-block:: console
 
-    $ cat > train.csv << EOF
-    Years,Expertise,Trust,Salary
-    0,1,0.2,10
-    1,3,0.4,20
-    2,5,0.6,30
-    3,7,0.8,40
-    EOF
-    $ cat > test.csv << EOF
-    Years,Expertise,Trust,Salary
-    4,9,1.0,50
-    5,11,1.2,60
-    EOF
-    $ dffml train \\
-        -model scikitlr \\
-        -model-features Years:int:1 Expertise:int:1 Trust:float:1 \\
-        -model-predict Salary:float:1 \\
-        -sources f=csv \\
-        -source-filename train.csv \\
-        -log debug
-    $ dffml accuracy \\
-        -model scikitlr \\
-        -model-features Years:int:1 Expertise:int:1 Trust:float:1 \\
-        -model-predict Salary:float:1 \\
-        -sources f=csv \\
-        -source-filename test.csv \\
-        -log debug
     1.0
-    $ echo -e 'Years,Expertise,Trust\\n6,13,1.4\\n' | \\
-      dffml predict all \\
-        -model scikitlr \\
-        -model-features Years:int:1 Expertise:int:1 Trust:float:1 \\
-        -model-predict Salary:float:1 \\
-        -sources f=csv \\
-        -source-filename /dev/stdin \\
-        -log debug
+
+Make a prediction
+
+.. literalinclude:: /../model/scikit/examples/lr/predict.sh
+
+Output:
+
+.. code-block:: json
+
     [
         {
             "extra": {},
             "features": {
                 "Expertise": 13,
-                "Trust": 1.4,
+                "Trust": 0.7,
                 "Years": 6
             },
-            "last_updated": "2019-09-18T19:04:18Z",
+            "key": "0",
+            "last_updated": "2020-03-01T22:26:46Z",
             "prediction": {
-                "confidence": 1.0,
-                "value": 70.00000000000001
-            },
-            "key": 0
+                "Salary": {
+                    "confidence": 1.0,
+                    "value": 70.0
+                }
+            }
         }
     ]
 
-Example usage of Linear Regression Model using python API:
-
-.. code-block:: python
 
-    from dffml import CSVSource, Features, DefFeature
-    from dffml.noasync import train, accuracy, predict
-    from dffml_model_scikit import LinearRegressionModel
-
-    model = LinearRegressionModel(
-        features=Features(
-            DefFeature("Years", int, 1),
-            DefFeature("Expertise", int, 1),
-            DefFeature("Trust", float, 1),
-        ),
-        predict=DefFeature("Salary", int, 1),
-    )
-
-    # Train the model
-    train(model, "train.csv")
-
-    # Assess accuracy (alternate way of specifying data source)
-    print("Accuracy:", accuracy(model, CSVSource(filename="test.csv")))
+Example usage of Linear Regression Model using python API:
 
-    # Make prediction
-    for i, features, prediction in predict(
-        model,
-        {"Years": 6, "Expertise": 13, "Trust": 0.7},
-        {"Years": 7, "Expertise": 15, "Trust": 0.8},
-    ):
-        features["Salary"] = prediction["Salary"]["value"]
-        print(features)
+.. literalinclude:: /../model/scikit/examples/lr/lr.py
 
 Example below uses KMeans Clustering Model on a small randomly generated dataset.
 
diff --git a/model/scikit/examples/lr/accuracy.sh b/model/scikit/examples/lr/accuracy.sh
@@ -0,0 +1,6 @@
+dffml accuracy \
+  -model scikitlr \
+  -model-features Years:int:1 Expertise:int:1 Trust:float:1 \
+  -model-predict Salary:float:1 \
+  -sources f=csv \
+  -source-filename test.csv
diff --git a/model/scikit/examples/lr/lr.py b/model/scikit/examples/lr/lr.py
@@ -0,0 +1,27 @@
+from dffml import CSVSource, Features, DefFeature
+from dffml.noasync import train, accuracy, predict
+from dffml_model_scikit import LinearRegressionModel
+
+model = LinearRegressionModel(
+    features=Features(
+        DefFeature("Years", int, 1),
+        DefFeature("Expertise", int, 1),
+        DefFeature("Trust", float, 1),
+    ),
+    predict=DefFeature("Salary", int, 1),
+)
+
+# Train the model on the records in train.csv
+train(model, "train.csv")
+
+# Assess accuracy on test.csv (CSVSource is an alternate way of specifying data source)
+print("Accuracy:", accuracy(model, CSVSource(filename="test.csv")))
+
+# Predict Salary for two new records, print each record with its prediction
+for i, features, prediction in predict(
+    model,
+    {"Years": 6, "Expertise": 13, "Trust": 0.7},
+    {"Years": 7, "Expertise": 15, "Trust": 0.8},
+):
+    features["Salary"] = prediction["Salary"]["value"]
+    print(features)
diff --git a/model/scikit/examples/lr/predict.sh b/model/scikit/examples/lr/predict.sh
@@ -0,0 +1,7 @@
+echo -e 'Years,Expertise,Trust\n6,13,0.7\n' | \
+dffml predict all \
+  -model scikitlr \
+  -model-features Years:int:1 Expertise:int:1 Trust:float:1 \
+  -model-predict Salary:float:1 \
+  -sources f=csv \
+  -source-filename /dev/stdin
diff --git a/model/scikit/examples/lr/test_data.sh b/model/scikit/examples/lr/test_data.sh
@@ -0,0 +1,5 @@
+cat > test.csv << EOF
+Years,Expertise,Trust,Salary
+4,9,0.5,50
+5,11,0.6,60
+EOF
diff --git a/model/scikit/examples/lr/test_lr.py b/model/scikit/examples/lr/test_lr.py
@@ -0,0 +1,77 @@
+import os
+import ast
+import sys
+import json
+import tempfile
+import contextlib
+import subprocess
+import unittest.mock
+
+from dffml.util.os import chdir
+
+
+def sh_filepath(filename):
+    """
+    Return the path to ``filename`` within the directory containing this file.
+    """
+    return os.path.join(os.path.dirname(__file__), filename)
+
+
+@contextlib.contextmanager
+def directory_with_csv_files():
+    """
+    Enter a fresh temporary directory via chdir, run train_data.sh and
+    test_data.sh there, and yield the path of that directory.
+    """
+    with tempfile.TemporaryDirectory() as tempdir:
+        with chdir(tempdir):
+            subprocess.check_output(["bash", sh_filepath("train_data.sh")])
+            subprocess.check_output(["bash", sh_filepath("test_data.sh")])
+            yield tempdir
+
+
+class TestExample(unittest.TestCase):
+    """
+    Run the LR documentation example files and check their output.
+    """
+
+    def python_test(self, filename):
+        """
+        Run the example Python script ``filename`` and check what it prints.
+        """
+        # Path to target file
+        filepath = sh_filepath(filename)
+        # Capture output
+        stdout = subprocess.check_output([sys.executable, filepath])
+        lines = stdout.decode().split("\n")
+        # Check the Accuracy
+        self.assertIn("Accuracy: 1.0", lines[0])
+        # Check the salary printed for each of the two predicted records
+        self.assertEqual(round(ast.literal_eval(lines[1])["Salary"]), 70.0)
+        self.assertEqual(round(ast.literal_eval(lines[2])["Salary"]), 80.0)
+
+    def test_python_filenames(self):
+        # The yielded path is not needed, lr.py opens train.csv and test.csv
+        # by relative filename
+        with directory_with_csv_files():
+            self.python_test("lr.py")
+
+    def test_shell(self):
+        with directory_with_csv_files():
+            # Run training
+            subprocess.check_output(["bash", sh_filepath("train.sh")])
+            # Check the Accuracy
+            stdout = subprocess.check_output(
+                ["bash", sh_filepath("accuracy.sh")]
+            )
+            self.assertEqual(stdout.decode().strip(), "1.0")
+            # Make the prediction
+            stdout = subprocess.check_output(
+                ["bash", sh_filepath("predict.sh")]
+            )
+            records = json.loads(stdout.decode())
+            # Check the salary
+            self.assertEqual(
+                round(records[0]["prediction"]["Salary"]["value"]), 70.0
+            )
diff --git a/model/scikit/examples/lr/train.sh b/model/scikit/examples/lr/train.sh
@@ -0,0 +1,6 @@
+dffml train \
+  -model scikitlr \
+  -model-features Years:int:1 Expertise:int:1 Trust:float:1 \
+  -model-predict Salary:float:1 \
+  -sources f=csv \
+  -source-filename train.csv
diff --git a/model/scikit/examples/lr/train_data.sh b/model/scikit/examples/lr/train_data.sh