skel: model: Convert to SimpleModel

pdxjohnny · pdxjohnny · commit 3a32d98fe76e · 2020-03-10T15:53:45.000-07:00
Signed-off-by: John Andersen <johnandersenpdx@gmail.com>
diff --git a/dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/misc.py b/dffml/skel/model/REPLACE_IMPORT_PACKAGE_NAME/misc.py
@@ -1,60 +1,122 @@
-# SPDX-License-Identifier: MIT
-# Copyright (c) 2019 Intel Corporation
-"""
-Description of what this model does
-"""
-from typing import AsyncIterator, Tuple, Any, List
-
-from dffml.record import Record
-from dffml.source.source import Sources
-from dffml.feature import Features
-from dffml.model.accuracy import Accuracy
-from dffml.model.model import ModelContext, Model
-from dffml.util.entrypoint import entrypoint
-from dffml.base import config
+import pathlib
+import statistics
+from typing import AsyncIterator, Tuple, Any, Type, List
+
+from dffml import (
+    config,
+    field,
+    entrypoint,
+    SimpleModel,
+    ModelNotTrained,
+    Accuracy,
+    Feature,
+    Features,
+    Sources,
+    Record,
+)
+
+
+def matrix_subtract(one, two):
+    """Return the element-wise difference ``one[i] - two[i]`` as a list."""
+    pairs = zip(one, two)
+    return [minuend - subtrahend for minuend, subtrahend in pairs]
+
+
+def matrix_multiply(one, two):
+    """Return the element-wise (not matrix) product of two sequences."""
+    pairs = zip(one, two)
+    return [left * right for left, right in pairs]
+
+
+def squared_error(y, line):  # sum of the squared differences y[i] - line[i]
+    return sum(difference ** 2 for difference in matrix_subtract(y, line))
+
+
+def coeff_of_deter(y, regression_line):
+    """Return R squared: 1 - (error of regression_line / error of mean)."""
+    error_of_mean = squared_error(y, [statistics.mean(y)] * len(y))
+    error_of_regression = squared_error(y, regression_line)
+    return 1 - (error_of_regression / error_of_mean)
+
+
+def best_fit_line(x, y):
+    """Least-squares fit of y = m * x + b; returns (m, b, accuracy)."""
+    mean_x = statistics.mean(x)
+    mean_y = statistics.mean(y)
+    m = (mean_x * mean_y - statistics.mean(matrix_multiply(x, y))) / (
+        (mean_x ** 2) - statistics.mean(matrix_multiply(x, x))
+    )
+    b = mean_y - (m * mean_x)
+    regression_line = [m * x_element + b for x_element in x]
+    return (m, b, coeff_of_deter(y, regression_line))
 
 
 @config
 class MiscModelConfig:
-    # This model never uses the directory, but chances are if you want to save
-    # and load data from disk you will need to
-    directory: str
-    classifications: List[str]
-    features: Features
+    predict: Feature = field("Label or the value to be predicted")
+    features: Features = field("Features to train on. For SLR only 1 allowed")
+    directory: pathlib.Path = field(
+        "Directory where state should be saved",
+        default=pathlib.Path("~", ".cache", "dffml", "miscmodel"),
+    )  # NOTE(review): "~" is not expanded here; confirm a consumer does
 
 
-class MiscModelContext(ModelContext):
-    """
-    Model wraping model_name API
-    """
+@entrypoint("miscmodel")
+class MiscModel(SimpleModel):
+    """Simple Linear Regression model: predicts y = m * x + b."""
+
+    # The configuration class needs to be set as the CONFIG property
+    CONFIG: Type[MiscModelConfig] = MiscModelConfig
+    # Simple Linear Regression only supports training on a single feature.
+    # Omit NUM_SUPPORTED_FEATURES if you support arbitrary numbers of features.
+    NUM_SUPPORTED_FEATURES: int = 1
+    # We only support single dimensional values, non-matrix / array.
+    # Omit SUPPORTED_LENGTHS if you support arbitrary dimensions.
+    SUPPORTED_LENGTHS: List[int] = [1]
 
-    async def train(self, sources: Sources):
-        """
-        Train using records as the data to learn from.
-        """
-        pass
+    async def train(self, sources: Sources) -> None:
+        """Fit a line to the records in sources and save it in storage."""
+        x = []
+        y = []
+        # Go through all records that have the feature we're training on and the
+        # feature we want to predict. Since our model only supports 1 feature,
+        # the self.features list will only have one element at index 0.
+        async for record in sources.with_features(
+            self.features + [self.config.predict.NAME]
+        ):
+            x.append(record.feature(self.features[0]))
+            y.append(record.feature(self.config.predict.NAME))
+        # Use self.logger to report how many records are being used for training
+        self.logger.debug("Number of input records: %d", len(x))
+        # Save m, b, and accuracy
+        self.storage["regression_line"] = best_fit_line(x, y)
 
     async def accuracy(self, sources: Sources) -> Accuracy:
-        """
-        Evaluates the accuracy of our model after training using the input records
-        as test data.
-        """
-        # Lies
-        return 1.0
+        """Return the accuracy saved by train(); sources is not consulted."""
+        regression_line = self.storage.get("regression_line", None)
+        # Ensure the model has been trained before we try to assess accuracy
+        if regression_line is None:
+            raise ModelNotTrained("Train model before assessing for accuracy.")
+        # Accuracy is the last of the three saved values: m, b, and accuracy.
+        # It was calculated by best_fit_line() from the training data.
+        return Accuracy(regression_line[2])
 
     async def predict(
         self, records: AsyncIterator[Record]
-    ) -> AsyncIterator[Tuple[Record, Any, float]]:
-        """
-        Uses trained data to make a prediction about the quality of a record.
-        """
+    ) -> AsyncIterator[Record]:
+        """Yield each record after setting its predicted value on it."""
+        regression_line = self.storage.get("regression_line", None)
+        # Ensure the model has been trained before we try to make a prediction
+        if regression_line is None:
+            raise ModelNotTrained("Train model before prediction.")
+        # Expand the regression_line into named variables
+        m, b, accuracy = regression_line
+        # Iterate through each record that needs a prediction
         async for record in records:
-            yield record, self.parent.config.classifications[
-                record.feature(self.parent.config.features.names()[0])
-            ], 1.0
-
-
-@entrypoint("misc")
-class MiscModel(Model):
-
-    CONTEXT = MiscModelContext
+            # Grab the x data from the record and calculate y
+            x = record.feature(self.features[0])
+            y = m * x + b
+            # Set the calculated value with the estimated accuracy
+            record.predicted(self.config.predict.NAME, y, accuracy)
+            # Yield the record to the caller
+            yield record
diff --git a/dffml/skel/model/tests/test_model.py b/dffml/skel/model/tests/test_model.py
@@ -1,88 +1,83 @@
-import random
 import tempfile
-from typing import Type
 
-from dffml.record import Record, RecordData
-from dffml.source.source import Sources
-from dffml.source.memory import MemorySource, MemorySourceConfig
-from dffml.feature import Data, Feature, Features
-from dffml.util.asynctestcase import AsyncTestCase
+from dffml import train, accuracy, predict, DefFeature, Features, AsyncTestCase
 
 from REPLACE_IMPORT_PACKAGE_NAME.misc import MiscModel, MiscModelConfig
 
+TRAIN_DATA = [  # rows of [X, Y] used to train the model
+    [12.4, 11.2],
+    [14.3, 12.5],
+    [14.5, 12.7],
+    [14.9, 13.1],
+    [16.1, 14.1],
+    [16.9, 14.8],
+    [16.5, 14.4],
+    [15.4, 13.4],
+    [17.0, 14.9],
+    [17.9, 15.6],
+    [18.8, 16.4],
+    [20.3, 17.7],
+    [22.4, 19.6],
+    [19.4, 16.9],
+    [15.5, 14.0],
+    [16.7, 14.6],
+]
 
-class StartsWithA(Feature):
+TEST_DATA = [  # rows of [X, Y] used to check accuracy and predictions
+    [17.3, 15.1],
+    [18.4, 16.1],
+    [19.2, 16.8],
+    [17.4, 15.2],
+    [19.5, 17.0],
+    [19.7, 17.2],
+    [21.2, 18.6],
+]
 
-    NAME: str = "starts_with_a"
 
-    def dtype(self) -> Type:
-        return int
-
-    def length(self) -> int:
-        return 1
-
-    async def calc(self, data: Data) -> int:
-        return 1 if data.key.lower().startswith("a") else 0
-
-
-class TestMisc(AsyncTestCase):
+class TestMiscModel(AsyncTestCase):
+    """Train MiscModel, then check its accuracy and its predictions."""
+
     @classmethod
     def setUpClass(cls):
-        cls.feature = StartsWithA()
-        cls.features = Features(cls.feature)
+        # Trained model state is saved in a temporary directory
         cls.model_dir = tempfile.TemporaryDirectory()
+        # Each [x, y] pair becomes a record with the features "X" and "Y".
+        # These records are used to train the model
+        cls.train_data = [{"X": x, "Y": y} for x, y in TRAIN_DATA]
+        # These records are used to assess accuracy and check predictions
+        cls.test_data = [{"X": x, "Y": y} for x, y in TEST_DATA]
+        # Instantiate the model under test: it saves its state in the
+        # temporary directory and predicts "Y" from the single feature "X"
         cls.model = MiscModel(
-            MiscModelConfig(
-                directory=cls.model_dir.name,
-                classifications=["not a", "a"],
-                features=cls.features,
-            )
-        )
-        cls.records = [
-            Record(
-                "a" + str(random.random()),
-                data={"features": {cls.feature.NAME: 1, "string": "a"}},
-            )
-            for _ in range(0, 1000)
-        ]
-        cls.records += [
-            Record(
-                "b" + str(random.random()),
-                data={"features": {cls.feature.NAME: 0, "string": "not a"}},
-            )
-            for _ in range(0, 1000)
-        ]
-        cls.sources = Sources(
-            MemorySource(MemorySourceConfig(records=cls.records))
+            directory=cls.model_dir.name,
+            predict=DefFeature("Y", float, 1),
+            features=Features(DefFeature("X", float, 1)),
         )
 
     @classmethod
     def tearDownClass(cls):
+        # Delete the temporary directory that held the trained model
         cls.model_dir.cleanup()
 
     async def test_00_train(self):
-        async with self.sources as sources, self.model as model:
-            async with sources() as sctx, model() as mctx:
-                await mctx.train(sctx)
+        # Fit the model to the training records
+        await train(self.model, *self.train_data)
 
     async def test_01_accuracy(self):
-        async with self.sources as sources, self.model as model:
-            async with sources() as sctx, model() as mctx:
-                res = await mctx.accuracy(sctx)
-                self.assertGreater(res, 0.9)
+        # Assess the trained model against the test records
+        score = await accuracy(self.model, *self.test_data)
+        # The accuracy must be at least 80% and strictly below 100%
+        self.assertTrue(0.8 <= score < 1.0)
 
     async def test_02_predict(self):
-        a = Record("a", data={"features": {self.feature.NAME: 1}})
-        b = Record("not a", data={"features": {self.feature.NAME: 0}})
-        async with Sources(
-            MemorySource(MemorySourceConfig(records=[a, b]))
-        ) as sources, self.model as model:
-            async with sources() as sctx, model() as mctx:
-                num = 0
-                async for record, prediction, confidence in mctx.predict(
-                    sctx.records()
-                ):
-                    with self.subTest(record=record):
-                        self.assertEqual(prediction, record.key)
-                    num += 1
-                self.assertEqual(num, 2)
+        # Request a prediction for every record in the test data
+        async for i, features, prediction in predict(
+            self.model, *self.test_data
+        ):
+            # Look up the known Y value for this record
+            expected = self.test_data[i]["Y"]
+            # Pull the predicted Y value out of the prediction
+            predicted = prediction["Y"]["value"]
+            # The prediction must be within 10% of the known value
+            self.assertLess(predicted, expected * 1.1)
+            self.assertGreater(predicted, expected * (1.0 - 0.1))