model: Move scratch slr into main package

mhash1m · web-flow · commit e1cd1125a4d9 · 2020-03-31T13:06:10.000-07:00
- Moved SLR into the main dffml package
- Removed scratch:slr

Fixes: #500
Fixes: #499
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Treat `"~"` as the the home directory rather than a literal
 - Windows support by selecting `asyncio.ProactorEventLoop` and not using
   `asyncio.FastChildWatcher`.
+- Moved SLR into the main dffml package and removed `scratch:slr`.
 
 ## [0.3.5] - 2020-03-10
 ### Added
diff --git a/dffml/model/slr.py b/dffml/model/slr.py
@@ -0,0 +1,122 @@
+import pathlib
+import statistics
+from typing import AsyncIterator, Tuple, Any, Type, List
+
+from dffml import (
+    config,
+    field,
+    entrypoint,
+    SimpleModel,
+    ModelNotTrained,
+    Accuracy,
+    Feature,
+    Features,
+    Sources,
+    Record,
+)
+
+
+def matrix_subtract(one, two):
+    return [
+        one_element - two_element for one_element, two_element in zip(one, two)
+    ]
+
+
+def matrix_multiply(one, two):
+    return [
+        one_element * two_element for one_element, two_element in zip(one, two)
+    ]
+
+
+def squared_error(y, line):
+    return sum(map(lambda element: element ** 2, matrix_subtract(y, line)))
+
+
+def coeff_of_deter(y, regression_line):
+    y_mean_line = [statistics.mean(y)] * len(y)
+    squared_error_mean = squared_error(y, y_mean_line)
+    squared_error_regression = squared_error(y, regression_line)
+    return 1 - (squared_error_regression / squared_error_mean)
+
+
+def best_fit_line(x, y):
+    mean_x = statistics.mean(x)
+    mean_y = statistics.mean(y)
+    m = (mean_x * mean_y - statistics.mean(matrix_multiply(x, y))) / (
+        (mean_x ** 2) - statistics.mean(matrix_multiply(x, x))
+    )
+    b = mean_y - (m * mean_x)
+    regression_line = [m * x + b for x in x]
+    accuracy = coeff_of_deter(y, regression_line)
+    return (m, b, accuracy)
+
+
+@config
+class SLRModelConfig:
+    predict: Feature = field("Label or the value to be predicted")
+    features: Features = field("Features to train on. For SLR only 1 allowed")
+    directory: pathlib.Path = field(
+        "Directory where state should be saved",
+        default=pathlib.Path("~", ".cache", "dffml", "slr"),
+    )
+
+
+@entrypoint("slr")
+class SLRModel(SimpleModel):
+    # The configuration class needs to be set as the CONFIG property
+    CONFIG: Type[SLRModelConfig] = SLRModelConfig
+    # Simple Linear Regression only supports training on a single feature.
+    # Do not define NUM_SUPPORTED_FEATURES if you support arbitrary numbers of
+    # features.
+    NUM_SUPPORTED_FEATURES: int = 1
+    # We only support single dimensional values, non-matrix / array
+    # Do not define SUPPORTED_LENGTHS if you support arbitrary dimensions
+    SUPPORTED_LENGTHS: List[int] = [1]
+
+    async def train(self, sources: Sources) -> None:
+        # X and Y data
+        x = []
+        y = []
+        # Go through all records that have the feature we're training on and the
+        # feature we want to predict. Since our model only supports 1 feature,
+        # the self.features list will only have one element at index 0.
+        async for record in sources.with_features(
+            self.features + [self.config.predict.NAME]
+        ):
+            x.append(record.feature(self.features[0]))
+            y.append(record.feature(self.config.predict.NAME))
+        # Use self.logger to report how many records are being used for training
+        self.logger.debug("Number of input records: %d", len(x))
+        # Save m, b, and accuracy
+        self.storage["regression_line"] = best_fit_line(x, y)
+
+    async def accuracy(self, sources: Sources) -> Accuracy:
+        # Load saved regression line
+        regression_line = self.storage.get("regression_line", None)
+        # Ensure the model has been trained before we try to make a prediction
+        if regression_line is None:
+            raise ModelNotTrained("Train model before assessing for accuracy.")
+        # Accuracy is the last element in regression_line, which is a list of
+        # three values: m, b, and accuracy.
+        return Accuracy(regression_line[2])
+
+    async def predict(
+        self, records: AsyncIterator[Record]
+    ) -> AsyncIterator[Tuple[Record, Any, float]]:
+        # Load saved regression line
+        regression_line = self.storage.get("regression_line", None)
+        # Ensure the model has been trained before we try to make a prediction
+        if regression_line is None:
+            raise ModelNotTrained("Train model before prediction.")
+        # Expand the regression_line into named variables
+        m, b, accuracy = regression_line
+        # Iterate through each record that needs a prediction
+        async for record in records:
+            # Grab the x data from the record
+            x = record.feature(self.features[0])
+            # Calculate y
+            y = m * x + b
+            # Set the calculated value with the estimated accuracy
+            record.predicted(self.config.predict.NAME, y, accuracy)
+            # Yield the record to the caller
+            yield record
diff --git a/model/scratch/dffml_model_scratch/slr.py b/model/scratch/dffml_model_scratch/slr.py
diff --git a/model/scratch/setup.py b/model/scratch/setup.py
@@ -67,7 +67,6 @@
     packages=find_packages(),
     entry_points={
         "dffml.model": [
-            "scratchslr = dffml_model_scratch.slr:SLR",
             "scratchlgrsag = dffml_model_scratch.logisticregression:LogisticRegression",
         ]
     },
diff --git a/model/scratch/tests/test_slr.py b/model/scratch/tests/test_slr.py
@@ -3,7 +3,7 @@
 
 from dffml import train, accuracy, predict, DefFeature, Features, AsyncTestCase
 
-from dffml_model_scratch.slr import SLR, SLRConfig
+from dffml.model.slr import SLRModel, SLRModelConfig
 
 TRAIN_DATA = [
     [12.4, 11.2],
@@ -49,7 +49,7 @@ def setUpClass(cls):
         for x, y in TEST_DATA:
             cls.test_data.append({"X": x, "Y": y})
         # Create an instance of the model
-        cls.model = SLR(
+        cls.model = SLRModel(
             directory=cls.model_dir.name,
             predict=DefFeature("Y", float, 1),
             features=Features(DefFeature("X", float, 1)),
diff --git a/model/scratch/tests/test_slr_integration.py b/model/scratch/tests/test_slr_integration.py
@@ -20,7 +20,7 @@ async def test_run(self):
         # Arguments for the model
         model_args = [
             "-model",
-            "scratchslr",
+            "slr",
             "-model-features",
             "Years:int:1",
             "-model-predict",
diff --git a/setup.py b/setup.py
@@ -109,5 +109,7 @@
         "dffml.orchestrator": ["memory = dffml.df.memory:MemoryOrchestrator"],
         # Databases
         "dffml.db": ["sqlite = dffml.db.sqlite:SqliteDatabase"],
+        # Models
+        "dffml.model": ["slr = dffml.model.slr:SLRModel"],
     },
 )
diff --git a/tests/integration/test_service_dev.py b/tests/integration/test_service_dev.py
@@ -44,7 +44,7 @@ async def test_run(self):
         await CLI.cli(
             "train",
             "-model",
-            "scratchslr",
+            "slr",
             "-model-features",
             "Years:int:1",
             "-model-predict",
@@ -63,7 +63,7 @@ async def test_run(self):
             "-features",
             json.dumps({"Years": 6}),
             "-config-model",
-            "scratchslr",
+            "slr",
             "-config-model-features",
             "Years:int:1",
             "-config-model-predict",

Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,6 @@`
`67`	`67`	`packages=find_packages(),`
`68`	`68`	`entry_points={`
`69`	`69`	`"dffml.model": [`
`70`		`- "scratchslr = dffml_model_scratch.slr:SLR",`
`71`	`70`	`"scratchlgrsag = dffml_model_scratch.logisticregression:LogisticRegression",`
`72`	`71`	`]`
`73`	`72`	`},`
Original file line number	Diff line number	Diff line change
`@@ -109,5 +109,7 @@`
`109`	`109`	`"dffml.orchestrator": ["memory = dffml.df.memory:MemoryOrchestrator"],`
`110`	`110`	`# Databases`
`111`	`111`	`"dffml.db": ["sqlite = dffml.db.sqlite:SqliteDatabase"],`
	`112`	`+ # Models`
	`113`	`+ "dffml.model": ["slr = dffml.model.slr:SLRModel"],`
`112`	`114`	`},`
`113`	`115`	`)`