model: scratch: Alternate Logistic Regression implementation

Naman1233 · web-flow · commit fb6cb4e35c5d · 2020-03-17T12:18:42.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Docstrings and doctestable examples to `record.py`.
 - Inputs can be validated using operations
   - `validate` parameter in `Input` takes `Operation.instance_name`
+- Logistic Regression with SAG optimizer
 - Test tensorflow DNNEstimator documentation exaples in CI
 - Add python code for tensorflow DNNEstimator
 ### Fixed
diff --git a/docs/plugins/dffml_model.rst b/docs/plugins/dffml_model.rst
@@ -417,6 +417,28 @@ dffml_model_scratch
     pip install dffml-model-scratch
 
 
+scratchlgr
+~~~~~~~~~~
+
+*Official*
+
+No description
+
+**Args**
+
+- predict: Feature
+
+  - Label or the value to be predicted
+
+- features: List of features
+
+  - Features to train on
+
+- directory: Path
+
+  - default: ~/.cache/dffml/scratch
+  - Directory where state should be saved
+
 scratchslr
 ~~~~~~~~~~
 
diff --git a/model/scratch/dffml_model_scratch/logisticregression.py b/model/scratch/dffml_model_scratch/logisticregression.py
@@ -0,0 +1,169 @@
+import pathlib
+from typing import AsyncIterator, Tuple, Any
+
+import numpy as np
+
+from dffml import (
+    config,
+    field,
+    entrypoint,
+    SimpleModel,
+    ModelNotTrained,
+    Accuracy,
+    Feature,
+    Features,
+    Sources,
+    Record,
+)
+
+
+# Configuration for the scratch Logistic Regression model: the label to
+# predict, the feature to train on and where trained state is saved.
+@config
+class LogisticRegressionConfig:
+    predict: Feature = field("Label or the value to be predicted")
+    features: Features = field("Features to train on")
+    directory: pathlib.Path = field(
+        "Directory where state should be saved",
+        default=pathlib.Path("~", ".cache", "dffml", "scratch"),
+    )
+
+
+# Binary classifier on a single scalar feature. best_fit_line() fits a weight
+# ``w`` by gradient descent and predict_input() labels ``x`` as 1 when
+# ``w * x + b`` is above 0.5, otherwise 0. The fitted state is stored as a
+# ``(w, b, training_accuracy)`` tuple under the "separating_line" key of
+# ``self.storage``.
+@entrypoint("scratchlgr")
+class LogisticRegression(SimpleModel):
+
+    # The configuration class needs to be set as the CONFIG property
+    CONFIG = LogisticRegressionConfig
+    # Logistic Regression only supports training on a single feature
+    NUM_SUPPORTED_FEATURES = 1
+    # We only support single dimensional values, non-matrix / array
+    SUPPORTED_LENGTHS = [1]
+
+    def __init__(self, config):
+        super().__init__(config)
+        # Training inputs and labels, appended to by train()
+        self.xData = np.array([])
+        self.yData = np.array([])
+
+    @property
+    def separating_line(self):
+        """
+        Load separating_line from disk, if it hasn't been set yet, return None
+        """
+        return self.storage.get("separating_line", None)
+
+    @separating_line.setter
+    def separating_line(self, rline):
+        """
+        Set separating_line in self.storage so it will be saved to disk
+        """
+        self.storage["separating_line"] = rline
+
+    def predict_input(self, x):
+        """
+        Classify ``x`` using the stored separating line: return ``1`` when
+        ``w * x + b`` is greater than 0.5, otherwise ``0``.
+        """
+        w, b = self.separating_line[0], self.separating_line[1]
+        prediction = 1 if w * x + b > 0.5 else 0
+        self.logger.debug(
+            "Predicted Value of {}: {}".format(
+                self.config.predict.NAME, prediction
+            )
+        )
+        return prediction
+
+    def best_fit_line(self):
+        """
+        Fit the weight on ``self.xData`` / ``self.yData`` and return
+        ``(w, b, accuracy)``, where ``accuracy`` is the fraction of the
+        training records that the fitted line classifies correctly.
+        """
+        self.logger.debug(
+            "Number of input records: {}".format(len(self.xData))
+        )
+        x = self.xData
+        y = self.yData
+        learning_rate = 0.01
+        w = 0.01
+        # NOTE(review): b is never updated by the loop below, so the bias that
+        # gets stored is always 0.0 - confirm whether that is intended.
+        b = 0.0
+        # NOTE(review): the update below is the gradient of
+        # log(1 + exp(-y * z)) with respect to w. With labels of 0 and 1 the
+        # records labelled 0 give f == 0 and do not move w - confirm the
+        # intended label encoding.
+        for _ in range(1, 1500):
+            z = w * x + b
+            val = -np.multiply(y, z)
+            num = -np.multiply(y, np.exp(val))
+            den = 1 + np.exp(val)
+            f = num / den
+            gradJ = np.sum(x * f)
+            w = w - learning_rate * gradJ / len(x)
+        # Count the training records that land on the wrong side of 0.5
+        predicted = np.where(w * x + b > 0.5, 1, 0)
+        error = int(np.count_nonzero(predicted != y))
+        accuracy = 1 - (error / len(x))
+        return (w, b, accuracy)
+
+    async def train(self, sources: Sources):
+        """
+        Append the feature and label of every record in ``sources`` to the
+        data held on this instance, then refit and store the separating line.
+        """
+        names = self.features + [self.config.predict.NAME]
+        async for record in sources.with_features(names):
+            feature_data = record.features(names)
+            self.xData = np.append(self.xData, feature_data[self.features[0]])
+            self.yData = np.append(
+                self.yData, feature_data[self.config.predict.NAME]
+            )
+        self.separating_line = self.best_fit_line()
+
+    async def accuracy(self, sources: Sources) -> Accuracy:
+        """
+        Return the fraction of the records in ``sources`` whose label equals
+        the value the trained model predicts for them.
+        """
+        # Ensure the model has been trained before we try to make a prediction
+        if self.separating_line is None:
+            raise ModelNotTrained("Train model before assessing for accuracy.")
+        target = self.config.predict.NAME
+        names = self.features + [target]
+        total = 0
+        correct = 0
+        async for record in sources.with_features(names):
+            feature_data = record.features(names)
+            total += 1
+            predicted = self.predict_input(feature_data[self.features[0]])
+            if predicted == feature_data[target]:
+                correct += 1
+        if not total:
+            raise ValueError("No records to assess accuracy with.")
+        return Accuracy(correct / total)
+
+    async def predict(
+        self, records: AsyncIterator[Record]
+    ) -> AsyncIterator[Record]:
+        """
+        Yield each record with its predicted label set. The confidence
+        attached to every prediction is the accuracy stored at training time.
+        """
+        # Ensure the model has been trained before we try to make a prediction
+        if self.separating_line is None:
+            raise ModelNotTrained("Train model before prediction.")
+        target = self.config.predict.NAME
+        async for record in records:
+            feature_data = record.features(self.features)
+            record.predicted(
+                target,
+                self.predict_input(feature_data[self.features[0]]),
+                self.separating_line[2],
+            )
+            yield record
diff --git a/model/scratch/setup.py b/model/scratch/setup.py
@@ -65,5 +65,10 @@
     ],
     install_requires=INSTALL_REQUIRES,
     packages=find_packages(),
-    entry_points={"dffml.model": ["scratchslr = dffml_model_scratch.slr:SLR"]},
+    entry_points={
+        "dffml.model": [
+            "scratchslr = dffml_model_scratch.slr:SLR",
+            "scratchlgr = dffml_model_scratch.logisticregression:LogisticRegression",
+        ]
+    },
 )
diff --git a/model/scratch/tests/test_lgr.py b/model/scratch/tests/test_lgr.py
@@ -0,0 +1,83 @@
+import tempfile
+import unittest
+
+from dffml import train, accuracy, predict, DefFeature, Features, AsyncTestCase
+
+from dffml_model_scratch.logisticregression import (
+    LogisticRegressionConfig,
+    LogisticRegression,
+)
+
+# Training records as [X, Y] pairs, Y being the 0 / 1 label
+TRAIN_DATA = [
+    [0.90, 0],
+    [0.22, 0],
+    [0.34, 0],
+    [0.09, 0],
+    [0.76, 0],
+    [0.29, 0],
+    [0.98, 0],
+    [0.47, 0],
+    [0.51, 1],
+    [0.60, 1],
+    [0.97, 1],
+    [0.82, 1],
+    [0.24, 1],
+    [0.19, 1],
+    [0.79, 1],
+    [0.92, 1],
+]
+
+# Held out records as [X, Y] pairs, used for accuracy and prediction
+TEST_DATA = [
+    [0.28, 1],
+    [0.94, 0],
+    [0.64, 1],
+    [0.37, 1],
+    [0.65, 0],
+    [0.09, 1],
+    [0.22, 0],
+]
+
+
+class TestLogisticRegression(AsyncTestCase):
+    @classmethod
+    def setUpClass(cls):
+        # Create a temporary directory to store the trained model
+        cls.model_dir = tempfile.TemporaryDirectory()
+        # Create the training data
+        cls.train_data = [{"X": x, "Y": y} for x, y in TRAIN_DATA]
+        # Create the test data
+        cls.test_data = [{"X": x, "Y": y} for x, y in TEST_DATA]
+        # Create an instance of the model
+        cls.model = LogisticRegression(
+            directory=cls.model_dir.name,
+            predict=DefFeature("Y", float, 1),
+            features=Features(DefFeature("X", float, 1)),
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        # Remove the temporary directory where the trained model was stored
+        cls.model_dir.cleanup()
+
+    async def test_00_train(self):
+        # Train the model on the training data
+        await train(self.model, *self.train_data)
+
+    async def test_01_accuracy(self):
+        # Use the test data to assess the model's accuracy
+        res = await accuracy(self.model, *self.test_data)
+        # Accuracy is a fraction of records, it must be between 0 and 1
+        self.assertTrue(0.0 <= res <= 1.0)
+
+    async def test_02_predict(self):
+        # Get the prediction for each piece of test data
+        async for i, features, prediction in predict(
+            self.model, *self.test_data
+        ):
+            # Grab the predicted value
+            prediction = prediction["Y"]["value"]
+            # X does not separate the labels in this data, so only check that
+            # the classifier emitted one of its two classes
+            self.assertIn(prediction, (0, 1))
diff --git a/model/scratch/tests/test_lgr_integration.py b/model/scratch/tests/test_lgr_integration.py
@@ -0,0 +1,73 @@
+import csv
+import json
+import pathlib
+import contextlib
+
+from dffml.cli.cli import CLI
+from dffml.util.asynctestcase import IntegrationCLITestCase
+
+
+class TestLogisticRegression(IntegrationCLITestCase):
+    async def test_run(self):
+        # Make a temporary directory to store the model
+        directory = self.mktempdir()
+        # Create the csv data: f1 is 0.0 through 0.9, ans is 1 when f1 > 0.5
+        data_filename = self.mktempfile() + ".csv"
+        # newline="" is what the csv module asks for on files it writes
+        with open(pathlib.Path(data_filename), "w", newline="") as data_file:
+            writer = csv.writer(data_file, delimiter=",")
+            writer.writerow(["f1", "ans"])
+            writer.writerows(
+                [[i / 10, int(i / 10 > 0.5)] for i in range(0, 10)]
+            )
+        # Arguments for the model
+        model_args = [
+            "-model",
+            "scratchlgr",
+            "-model-features",
+            "f1:float:1",
+            "-model-predict",
+            "ans:int:1",
+            "-model-directory",
+            directory,
+        ]
+        # Train the model
+        await CLI.cli(
+            "train",
+            *model_args,
+            "-sources",
+            "training_data=csv",
+            "-source-filename",
+            data_filename,
+        )
+        # Assess accuracy
+        await CLI.cli(
+            "accuracy",
+            *model_args,
+            "-sources",
+            "test_data=csv",
+            "-source-filename",
+            data_filename,
+        )
+        with contextlib.redirect_stdout(self.stdout):
+            # Make prediction
+            await CLI._main(
+                "predict",
+                "all",
+                *model_args,
+                "-sources",
+                "predict_data=csv",
+                "-source-filename",
+                data_filename,
+            )
+            results = json.loads(self.stdout.getvalue())
+            self.assertTrue(isinstance(results, list))
+            self.assertEqual(len(results), 10)
+            for result in results:
+                self.assertIn("prediction", result)
+                result = result["prediction"]
+                self.assertIn("ans", result)
+                result = result["ans"]
+                self.assertIn("value", result)
+                # The classifier may only emit 0 or 1
+                self.assertIn(result["value"], (0, 1))