docs: model: daal4py: Add example usage for Linear Regression

mhash1m · web-flow · commit 9f485608653c · 2020-07-23T09:02:13.000-07:00
Fixes: #692 Signed-off-by: <johnandersenpdx@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -63,6 +63,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   core plugins
 - HTTP service got a `-redirect` flag which allows for URL redirection via a
   HTTP 307 response
+- Example usage for the daal4py Linear Regression model
 ### Changed
 - Renamed `-seed` to `-inputs` in `dataflow create` command
 - Renamed configloader/png to configloader/image and added support for loading JPEG and TIFF file formats
diff --git a/model/daal4py/dffml_model_daal4py/daal4pylr.py b/model/daal4py/dffml_model_daal4py/daal4pylr.py
@@ -28,6 +28,65 @@ class DAAL4PyLRModelConfig:
 
 @entrypoint("daal4pylr")
 class DAAL4PyLRModel(SimpleModel):
+    """
+    Implemented using daal4py.
+
+    First we create the training and testing datasets
+
+    .. literalinclude:: /../model/daal4py/examples/lr/train_data.sh
+
+    .. literalinclude:: /../model/daal4py/examples/lr/test_data.sh
+
+    Train the model
+
+    .. literalinclude:: /../model/daal4py/examples/lr/train.sh
+
+    Assess the accuracy
+
+    .. literalinclude:: /../model/daal4py/examples/lr/accuracy.sh
+
+    Output
+
+    .. code-block::
+
+        0.6666666666666666
+
+
+    Make a prediction
+
+    .. literalinclude:: /../model/daal4py/examples/lr/predict.sh
+
+    Output
+
+    .. code-block:: json
+
+        [
+            {
+                "extra": {},
+                "features": {
+                    "ans": 1,
+                    "f1": 0.8
+                },
+                "key": "0",
+                "last_updated": "2020-07-22T02:53:11Z",
+                "prediction": {
+                    "ans": {
+                        "confidence": NaN,
+                        "value": 1.1907472649730522
+                    }
+                }
+            }
+        ]
+
+
+
+
+
+    Example usage of the daal4py Linear Regression model using the Python API
+
+    .. literalinclude:: /../model/daal4py/examples/lr/lr.py
+    """
+
     CONFIG = DAAL4PyLRModelConfig
 
     def __init__(self, config) -> None:
@@ -114,6 +173,10 @@ async def predict(
             predict = self.pd.DataFrame(feature_data, index=[0])
             preds = self.lm_predictor.compute(predict, self.lm_trained)
             target = self.parent.config.predict.name
-            record.predicted(target, preds.prediction, float("nan"))
+            if preds.prediction.size == 1:
+                prediction = preds.prediction.flat[0]
+            else:
+                prediction = preds.prediction
+            record.predicted(target, prediction, float("nan"))
             # Yield the record to the caller
             yield record
diff --git a/model/daal4py/examples/lr/__init__.py b/model/daal4py/examples/lr/__init__.py
diff --git a/model/daal4py/examples/lr/accuracy.sh b/model/daal4py/examples/lr/accuracy.sh
@@ -0,0 +1,7 @@
+dffml accuracy \
+  -model daal4pylr \
+  -model-features f1:float:1 \
+  -model-predict ans:int:1 \
+  -model-directory tempdir \
+  -sources f=csv \
+  -source-filename test.csv
diff --git a/model/daal4py/examples/lr/lr.py b/model/daal4py/examples/lr/lr.py
@@ -0,0 +1,26 @@
+"""
+Example usage of the daal4py Linear Regression model from Python.
+"""
+from dffml import CSVSource, Features, Feature
+from dffml.noasync import train, accuracy, predict
+from dffml_model_daal4py.daal4pylr import DAAL4PyLRModel
+
+# A single float feature, f1, is used to predict ans
+model = DAAL4PyLRModel(
+    features=Features(Feature("f1", float, 1)),
+    predict=Feature("ans", int, 1),
+    directory="tempdir",
+)
+
+# Fit the model to the records in train.csv
+train(model, "train.csv")
+
+# Score the model on test.csv, this time passing the source as an object
+test_source = CSVSource(filename="test.csv")
+print("Accuracy:", accuracy(model, test_source))
+
+# Predict ans for a single record and show it in place of the input value
+record = {"f1": 0.8, "ans": 0}
+for _key, features, prediction in predict(model, record):
+    features["ans"] = prediction["ans"]["value"]
+    print(features)
diff --git a/model/daal4py/examples/lr/predict.sh b/model/daal4py/examples/lr/predict.sh
@@ -0,0 +1,8 @@
+echo -e 'f1,ans\n0.8,1\n' | \
+  dffml predict all \
+  -model daal4pylr \
+  -model-features f1:float:1 \
+  -model-predict ans:int:1 \
+  -model-directory tempdir \
+  -sources f=csv \
+  -source-filename /dev/stdin
diff --git a/model/daal4py/examples/lr/test_data.sh b/model/daal4py/examples/lr/test_data.sh
@@ -0,0 +1,9 @@
+cat > test.csv << EOF
+f1,ans
+18.8,16.4
+20.3,17.7
+22.4,19.6
+19.4,16.9
+15.5,14.0
+16.7,14.6
+EOF
diff --git a/model/daal4py/examples/lr/test_lr.py b/model/daal4py/examples/lr/test_lr.py
@@ -0,0 +1,87 @@
+import os
+import ast
+import sys
+import json
+import tempfile
+import contextlib
+import subprocess
+import unittest.mock
+import logging
+
+import numpy as np
+
+from dffml.util.os import chdir
+
+
+def sh_filepath(filename):
+    """
+    Return the path of ``filename`` inside the directory containing this file.
+    """
+    return os.path.join(os.path.dirname(__file__), filename)
+
+
+@contextlib.contextmanager
+def directory_with_csv_files():
+    """
+    Yield a temporary directory, entered via ``chdir``, in which
+    ``train_data.sh`` and ``test_data.sh`` have been run.
+    """
+    with tempfile.TemporaryDirectory() as tempdir:
+        with chdir(tempdir):
+            subprocess.check_output(["bash", sh_filepath("train_data.sh")])
+            subprocess.check_output(["bash", sh_filepath("test_data.sh")])
+            yield tempdir
+
+
+class TestExample(unittest.TestCase):
+    """
+    Run the example scripts next to this file and check their output.
+    """
+
+    def python_test(self, filename):
+        """
+        Run the Python example ``filename`` and check what it prints.
+
+        The first line of output must report the accuracy and the second must
+        be a dict literal whose ``ans`` value is a float.
+        """
+        # Path to target file
+        filepath = sh_filepath(filename)
+        # Capture output
+        stdout = subprocess.check_output([sys.executable, filepath])
+        lines = stdout.decode().split("\n")
+        # print("Accuracy:", value) separates its arguments with one space
+        self.assertRegex(lines[0], r"Accuracy: [-+]?\d*\.?\d+")
+        # Check the ans
+        self.assertIsInstance(ast.literal_eval(lines[1])["ans"], float)
+
+    def test_python_filenames(self):
+        """
+        Check the output of ``lr.py`` when run beside the generated CSV files.
+        """
+        with directory_with_csv_files():
+            self.python_test("lr.py")
+
+    def test_shell(self):
+        """
+        Run ``train.sh``, ``accuracy.sh`` and ``predict.sh`` in that order.
+        """
+        with directory_with_csv_files():
+            # Run training
+            subprocess.check_output(["bash", sh_filepath("train.sh")])
+            # Check the Accuracy
+            stdout = subprocess.check_output(
+                ["bash", sh_filepath("accuracy.sh")]
+            )
+            lines = stdout.decode().split("\n")
+            # First line of output must contain a number
+            self.assertRegex(lines[0], r"[-+]?\d*\.?\d+")
+            # Make the prediction
+            stdout = subprocess.check_output(
+                ["bash", sh_filepath("predict.sh")]
+            )
+            records = json.loads(stdout.decode())
+            # Check the ans
+            self.assertIsInstance(
+                records[0]["prediction"]["ans"]["value"], float
+            )
diff --git a/model/daal4py/examples/lr/train.sh b/model/daal4py/examples/lr/train.sh
@@ -0,0 +1,7 @@
+dffml train \
+  -model daal4pylr \
+  -model-features f1:float:1 \
+  -model-predict ans:int:1 \
+  -model-directory tempdir \
+  -sources f=csv \
+  -source-filename train.csv
diff --git a/model/daal4py/examples/lr/train_data.sh b/model/daal4py/examples/lr/train_data.sh
@@ -0,0 +1,19 @@
+cat >train.csv << EOF
+f1,ans
+12.4,11.2
+14.3,12.5
+14.5,12.7
+14.9,13.1
+16.1,14.1
+16.9,14.8
+16.5,14.4
+15.4,13.4
+17.0,14.9
+17.9,15.6
+18.8,16.4
+20.3,17.7
+22.4,19.6
+19.4,16.9
+15.5,14.0
+16.7,14.6
+EOF
diff --git a/model/daal4py/tests/test_lr.py b/model/daal4py/tests/test_lr.py
@@ -24,13 +24,13 @@
 ]
 
 TEST_DATA = [
-    [17.3, 15.1],
-    [18.4, 16.1],
-    [19.2, 16.8],
-    [17.4, 15.2],
-    [19.5, 17.0],
-    [19.7, 17.2],
-    [21.2, 18.6],
+    [17.9, 15.6],
+    [18.8, 16.4],
+    [20.3, 17.7],
+    [22.4, 19.6],
+    [19.4, 16.9],
+    [15.5, 14.0],
+    [16.7, 14.6],
 ]