make release-tag: Merge branch 'master' into stable

csala · csala · commit fc563be0d350 · 2020-01-30T13:27:39.000-05:00
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,23 +1,39 @@
 # History
 
+## 0.2.4 - 2020-01-30
+
+### New Primitives
+
+* Add RangeScaler and RangeUnscaler primitives - [Issue #232](https://github.com/HDI-Project/MLPrimitives/issues/232) by @csala
+
+### Primitive Improvements
+
+* Extract input_shape from X in keras.Sequential - [Issue #223](https://github.com/HDI-Project/MLPrimitives/issues/223) by @csala
+
+### Bug Fixes
+
+* mlprimitives.custom.text.TextCleaner fails if text is empty - [Issue #228](https://github.com/HDI-Project/MLPrimitives/issues/228) by @csala
+* Error when loading the reviews dataset - [Issue #230](https://github.com/HDI-Project/MLPrimitives/issues/230) by @csala
+* Curate dependencies: specify an explicit prompt-toolkit version range - [Issue #224](https://github.com/HDI-Project/MLPrimitives/issues/224) by @csala
+
 ## 0.2.3 - 2019-11-14
 
 ### New Primitives
 
-Add primitive to make window_sequences based on cutoff times - [Issue #217](https://github.com/HDI-Project/MLPrimitives/issues/217) by @csala
-Create a keras LSTM based TimeSeriesClassifier primitive - [Issue #218](https://github.com/HDI-Project/MLPrimitives/issues/218) by @csala
-Add pandas DataFrame primitives - [Issue #214](https://github.com/HDI-Project/MLPrimitives/issues/214) by @csala
-Add featuretools.EntitySet.normalize_entity primitive - [Issue #209](https://github.com/HDI-Project/MLPrimitives/issues/209) by @csala
+* Add primitive to make window_sequences based on cutoff times - [Issue #217](https://github.com/HDI-Project/MLPrimitives/issues/217) by @csala
+* Create a keras LSTM based TimeSeriesClassifier primitive - [Issue #218](https://github.com/HDI-Project/MLPrimitives/issues/218) by @csala
+* Add pandas DataFrame primitives - [Issue #214](https://github.com/HDI-Project/MLPrimitives/issues/214) by @csala
+* Add featuretools.EntitySet.normalize_entity primitive - [Issue #209](https://github.com/HDI-Project/MLPrimitives/issues/209) by @csala
 
 ### Primitive Improvements
 
-Make featuretools.EntitySet.entity_from_dataframe entityset arg optional - [Issue #208](https://github.com/HDI-Project/MLPrimitives/issues/208) by @csala
+* Make featuretools.EntitySet.entity_from_dataframe entityset arg optional - [Issue #208](https://github.com/HDI-Project/MLPrimitives/issues/208) by @csala
 
-Add text regression dataset - [Issue #206](https://github.com/HDI-Project/MLPrimitives/issues/206) by @csala
+* Add text regression dataset - [Issue #206](https://github.com/HDI-Project/MLPrimitives/issues/206) by @csala
 
 ### Bug Fixes
 
-pandas.DataFrame.resample crash when grouping by integer columns - [Issue #211](https://github.com/HDI-Project/MLPrimitives/issues/211) by @csala
+* pandas.DataFrame.resample crash when grouping by integer columns - [Issue #211](https://github.com/HDI-Project/MLPrimitives/issues/211) by @csala
 
 ## 0.2.2 - 2019-10-08
 
diff --git a/mlprimitives/__init__.py b/mlprimitives/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'MIT Data To AI Lab'
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.2.3'
+__version__ = '0.2.4.dev1'
 
 import os
 
diff --git a/mlprimitives/adapters/keras.py b/mlprimitives/adapters/keras.py
@@ -78,7 +78,24 @@ def __init__(self, layers, loss, optimizer, classification, callbacks=tuple(),
 
         self.callbacks = callbacks
 
+    def _setdefault(self, kwargs, key, value):
+        if key in kwargs:
+            return
+
+        if key in self.hyperparameters and self.hyperparameters[key] is None:
+            kwargs[key] = value
+
+    def _augment_hyperparameters(self, X, kwargs):
+        shape = np.asarray(X)[0].shape
+        length = shape[0]
+        self._setdefault(kwargs, 'input_shape', shape)
+        self._setdefault(kwargs, 'input_dim', length)
+        self._setdefault(kwargs, 'input_length', length)
+
+        return kwargs
+
     def fit(self, X, y, **kwargs):
+        self._augment_hyperparameters(X, kwargs)
         self.model = self._build_model(**kwargs)
 
         if self.classification:
diff --git a/mlprimitives/cli.py b/mlprimitives/cli.py
@@ -8,6 +8,7 @@
 import sys
 import warnings
 
+import pandas as pd
 from mlblocks import add_primitives_path, get_primitives_paths
 
 from mlprimitives.evaluation import score_pipeline
@@ -18,7 +19,7 @@
 def _logging_setup(verbosity=1):
     logger = logging.getLogger()
     log_level = (3 - verbosity) * 10
-    fmt = '%(asctime)s - %(levelname)s - %(message)s'
+    fmt = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
     formatter = logging.Formatter(fmt)
     logger.setLevel(log_level)
     logger.propagate = False
@@ -30,10 +31,36 @@ def _logging_setup(verbosity=1):
 
 
 def _test(args):
-    for pipeline in args.pipeline:
-        print('Scoring pipeline: {}'.format(pipeline))
-        score, stdev = score_pipeline(pipeline, args.splits, args.random_state, args.dataset)
-        print('Obtained Score: {:.4f} +/- {:.4f}'.format(score, stdev))
+    results = pd.DataFrame(columns=['pipeline', 'mean', 'std', 'error'])
+    try:
+        for pipeline in args.pipeline:
+            print('Scoring pipeline: {}'.format(pipeline))
+            pipeline_name = os.path.basename(pipeline)
+            try:
+                score, std = score_pipeline(
+                    pipeline,
+                    args.splits,
+                    args.random_state,
+                    args.dataset
+                )
+
+                print('Obtained Score: {:.4f} +/- {:.4f}'.format(score, std))
+                results = results.append({
+                    'pipeline': pipeline_name,
+                    'mean': score,
+                    'std': std,
+                }, ignore_index=True)
+
+            except Exception as ex:
+                results = results.append({
+                    'pipeline': pipeline_name,
+                    'error': ex,
+                }, ignore_index=True)
+
+    except KeyboardInterrupt:
+        pass
+
+    print(results.to_string(index=False))
 
 
 def _get_primitives(pattern):
diff --git a/mlprimitives/custom/preprocessing.py b/mlprimitives/custom/preprocessing.py
@@ -28,3 +28,41 @@ def fit(self, classes):
 
     def decode(self, y):
         return self._label_encoder.inverse_transform(y)
+
+
+class RangeScaler():
+
+    _data_min = None
+    _data_scale = None
+    _data_range = None
+
+    def __init__(self, out_min, out_max):
+        self._out_min = out_min
+        self._out_scale = out_max - out_min
+
+    def fit(self, X):
+        data_max = X.max(axis=0)
+        self._data_min = X.min(axis=0)
+        self._data_scale = data_max - self._data_min
+        self._data_range = (self._data_min, data_max)
+
+    def scale(self, X):
+        scaled = (X - self._data_min) / self._data_scale
+        rescaled = (scaled * self._out_scale) + self._out_min
+
+        return rescaled, self._data_range
+
+
+class RangeUnscaler():
+
+    def __init__(self, out_min, out_max):
+        self._out_min = out_min
+        self._out_scale = out_max - out_min
+
+    def fit(self, data_range):
+        self._data_min = data_range[0]
+        self._data_scale = data_range[1] - self._data_min
+
+    def unscale(self, X):
+        unscaled = (X - self._out_min) / self._out_scale
+        return (unscaled * self._data_scale) + self._data_min
diff --git a/mlprimitives/custom/text.py b/mlprimitives/custom/text.py
@@ -82,6 +82,9 @@ def get_stopwords(cls, language_code):
         return []
 
     def _remove_stopwords(self, text):
+        if text == '':
+            return text
+
         if self.language_code:
             language_code = self.language_code
 
diff --git a/mlprimitives/datasets.py b/mlprimitives/datasets.py
@@ -327,7 +327,7 @@ def load_reviews():
     X = _load_csv(dataset_path, 'data')
     y = X.pop('evaluation').values
 
-    return Dataset(load_reviews.__doc__, X, y, r2_score)
+    return Dataset(load_reviews.__doc__, X, y, r2_score, 'text', 'regression', 'univariate')
 
 
 def load_umls():
diff --git a/mlprimitives/pipelines/keras.Sequential.LSTMTextClassifier.json b/mlprimitives/pipelines/keras.Sequential.LSTMTextClassifier.json
@@ -39,7 +39,7 @@
             "maxlen": 100
         },
         "keras.Sequential.LSTMTextClassifier#1": {
-            "input_length": 100,
+            "epochs": 1,
             "verbose": true,
             "validation_split": 0.2,
             "callbacks": [
diff --git a/mlprimitives/pipelines/keras.Sequential.MLPBinaryClassifier.json b/mlprimitives/pipelines/keras.Sequential.MLPBinaryClassifier.json
@@ -11,7 +11,6 @@
     "primitives": [
         "mlprimitives.custom.preprocessing.ClassEncoder",
         "mlprimitives.custom.feature_extraction.CategoricalEncoder",
-        "mlprimitives.custom.counters.count_features",
         "keras.Sequential.MLPBinaryClassifier",
         "mlprimitives.custom.preprocessing.ClassDecoder"
     ]
diff --git a/mlprimitives/pipelines/keras.Sequential.MLPMultiClassClassifier.json b/mlprimitives/pipelines/keras.Sequential.MLPMultiClassClassifier.json
@@ -10,7 +10,6 @@
     },
     "primitives": [
         "mlprimitives.custom.counters.UniqueCounter",
-        "mlprimitives.custom.counters.count_features",
         "keras.Sequential.MLPMultiClassClassifier"
     ],
     "input_names": {
diff --git a/mlprimitives/primitives/keras.Sequential.DoubleLSTMTimeSeriesClassifier.json b/mlprimitives/primitives/keras.Sequential.DoubleLSTMTimeSeriesClassifier.json
@@ -75,10 +75,7 @@
             },
             "input_shape": {
                 "type": "tuple",
-                "default": [
-                    250,
-                    1
-                ]
+                "default": null
             },
             "dense_units": {
                 "type": "int",
diff --git a/mlprimitives/primitives/keras.Sequential.LSTMTextClassifier.json b/mlprimitives/primitives/keras.Sequential.LSTMTextClassifier.json
@@ -53,7 +53,8 @@
     "hyperparameters": {
         "fixed": {
             "input_length": {
-                "type": "int"
+                "type": "int",
+                "default": null
             },
             "classification": {
                 "type": "bool",
diff --git a/mlprimitives/primitives/keras.Sequential.LSTMTextRegressor.json b/mlprimitives/primitives/keras.Sequential.LSTMTextRegressor.json
@@ -44,7 +44,7 @@
         "fixed": {
             "input_length": {
                 "type": "int",
-                "default": 1500
+                "default": null
             },
             "classification": {
                 "type": "bool",
diff --git a/mlprimitives/primitives/keras.Sequential.LSTMTimeSeriesClassifier.json b/mlprimitives/primitives/keras.Sequential.LSTMTimeSeriesClassifier.json
@@ -75,10 +75,7 @@
             },
             "input_shape": {
                 "type": "tuple",
-                "default": [
-                    250,
-                    1
-                ]
+                "default": null
             },
             "dense_units": {
                 "type": "int",
diff --git a/mlprimitives/primitives/keras.Sequential.LSTMTimeSeriesRegressor.json b/mlprimitives/primitives/keras.Sequential.LSTMTimeSeriesRegressor.json
@@ -77,10 +77,7 @@
             },
             "input_shape": {
                 "type": "tuple",
-                "default": [
-                    250,
-                    1
-                ]
+                "default": null
             },
             "dense_units": {
                 "type": "int",
diff --git a/mlprimitives/primitives/keras.Sequential.MLPBinaryClassifier.json b/mlprimitives/primitives/keras.Sequential.MLPBinaryClassifier.json
@@ -20,11 +20,6 @@
             {
                 "name": "y",
                 "type": "array"
-            },
-            {
-                "name": "features",
-                "type": "int",
-                "description": "Number of features in X"
             }
         ]
     },
@@ -67,6 +62,11 @@
                 "type": "int",
                 "default": 20
             },
+            "input_dim": {
+                "type": "int",
+                "description": "Number of features in X",
+                "default": null
+            },
             "layers": {
                 "type": "list",
                 "default": [
@@ -75,7 +75,7 @@
                         "parameters": {
                             "units": "dense_1_units",
                             "activation": "relu",
-                            "input_dim": "features"
+                            "input_dim": "input_dim"
                         }
                     },
                     {
diff --git a/mlprimitives/primitives/keras.Sequential.MLPMultiClassClassifier.json b/mlprimitives/primitives/keras.Sequential.MLPMultiClassClassifier.json
@@ -25,11 +25,6 @@
                 "name": "classes",
                 "type": "int",
                 "description": "Number of classes"
-            },
-            {
-                "name": "features",
-                "type": "int",
-                "description": "Number of features in X"
             }
         ]
     },
@@ -72,6 +67,11 @@
                 "type": "int",
                 "default": 20
             },
+            "input_dim": {
+                "type": "int",
+                "default": null,
+                "description": "Number of features in X"
+            },
             "layers": {
                 "type": "list",
                 "default": [
@@ -80,7 +80,7 @@
                         "parameters": {
                             "units": "dense_1_units",
                             "activation": "relu",
-                            "input_dim": "features"
+                            "input_dim": "input_dim"
                         }
                     },
                     {
diff --git a/mlprimitives/primitives/keras.Sequential.SingleLayerCNNImageClassifier.json b/mlprimitives/primitives/keras.Sequential.SingleLayerCNNImageClassifier.json
@@ -57,11 +57,7 @@
             },
             "input_shape": {
                 "type": "list",
-                "default": [
-                    224,
-                    224,
-                    3
-                ]
+                "default": null
             },
             "conv_filters": {
                 "type": "int",
diff --git a/mlprimitives/primitives/keras.Sequential.SingleLayerCNNImageRegressor.json b/mlprimitives/primitives/keras.Sequential.SingleLayerCNNImageRegressor.json
@@ -52,11 +52,7 @@
             },
             "input_shape": {
                 "type": "list",
-                "default": [
-                    224,
-                    224,
-                    3
-                ]
+                "default": null
             },
             "conv_filters": {
                 "type": "int",
diff --git a/mlprimitives/primitives/keras.Sequential.VGGCNNClassifier.json b/mlprimitives/primitives/keras.Sequential.VGGCNNClassifier.json
@@ -55,11 +55,7 @@
             },
             "input_shape": {
                 "type": "list",
-                "default": [
-                    224,
-                    224,
-                    3
-                ]
+                "default": null
             },
             "conv2d_2_filters": {
                 "type": "int",
diff --git a/mlprimitives/primitives/mlprimitives.custom.preprocessing.RangeScaler.json b/mlprimitives/primitives/mlprimitives.custom.preprocessing.RangeScaler.json
diff --git a/mlprimitives/primitives/mlprimitives.custom.preprocessing.RangeUnscaler.json b/mlprimitives/primitives/mlprimitives.custom.preprocessing.RangeUnscaler.json
diff --git a/mlprimitives/utils.py b/mlprimitives/utils.py
diff --git a/setup.cfg b/setup.cfg
diff --git a/setup.py b/setup.py
diff --git a/tests/adapters/test_keras.py b/tests/adapters/test_keras.py
diff --git a/tests/custom/test_preprocessing.py b/tests/custom/test_preprocessing.py
diff --git a/tests/custom/test_text.py b/tests/custom/test_text.py
diff --git a/tests/primitives/test_primitives.py b/tests/primitives/test_primitives.py

Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,6 @@`
`11`	`11`	`"primitives": [`
`12`	`12`	`"mlprimitives.custom.preprocessing.ClassEncoder",`
`13`	`13`	`"mlprimitives.custom.feature_extraction.CategoricalEncoder",`
`14`		`- "mlprimitives.custom.counters.count_features",`
`15`	`14`	`"keras.Sequential.MLPBinaryClassifier",`
`16`	`15`	`"mlprimitives.custom.preprocessing.ClassDecoder"`
`17`	`16`	`]`