Process signals implementation (#10)

pvk-developer · web-flow · commit 1a8d0bc73907 · 2021-02-09T10:27:37.000+01:00
* Include demo_timeseries.csv with the install

* Process Signals scaffold.

* Fix fft

* Implement sigpro.process_signals and integration test

* Fix tests

* Add process_signals primitive

* Fix lint

* Add sampling_frequency as int or str; if str, use the column with that name from the dataframe.

* process signals update.

* Add process_signals function to the __init__

* Rename process signals primitive.
diff --git a/sigpro/__init__.py b/sigpro/__init__.py
@@ -10,6 +10,8 @@
 
 from mlblocks import discovery
 
+from sigpro.process_signals import process_signals
+
 _BASE_PATH = os.path.abspath(os.path.dirname(__file__))
 MLBLOCKS_PRIMITIVES = os.path.join(_BASE_PATH, 'primitives')
 
@@ -42,3 +44,6 @@ def get_primitives(name=None, primitive_type=None, primitive_subtype=None):
         filters['classifiers.subtype'] = primitive_subtype
 
     return discovery.find_primitives(name or 'sigpro', filters)
+
+
+__all__ = ('process_signals', )
diff --git a/sigpro/demo.py b/sigpro/demo.py
@@ -11,14 +11,14 @@
 DEMO_PATH = os.path.join(os.path.dirname(__file__), 'data')
 
 
-def get_demo_data():
+def get_demo_data(nrows=None):
     """Get a demo ``pandas.DataFrame`` containing the accepted data format.
 
     Returns:
         A ``pd.DataFrame`` containing as ``values`` the signal values.
     """
     demo_path = os.path.join(DEMO_PATH, 'demo_timeseries.csv')
-    df = pd.read_csv(demo_path, parse_dates=['timestamp'])
+    df = pd.read_csv(demo_path, parse_dates=['timestamp'], nrows=nrows)
     df["values"] = df["values"].apply(json.loads).apply(list)
     return df
 
@@ -74,12 +74,12 @@ def get_frequency_demo(index=None, real=True):
     """
     amplitude_values, sampling_frequency = get_amplitude_demo(index)
     fft_values = np.fft.fft(amplitude_values)
-    frequencies = np.fft.fftfreq(len(fft_values), sampling_frequency)
+    length = len(fft_values)
+    frequencies = np.fft.fftfreq(len(fft_values), 1 / sampling_frequency)
     if real:
         fft_values = np.real(fft_values)
-        frequencies = np.real(frequencies)
 
-    return fft_values, frequencies
+    return fft_values[0:length // 2], frequencies[0:length // 2]
 
 
 def get_frequency_time_demo(index=None, real=True):
diff --git a/sigpro/primitives/sigpro.process_signals.json b/sigpro/primitives/sigpro.process_signals.json
@@ -0,0 +1,47 @@
+{
+    "name": "sigpro.process_signals",
+    "primitive": "sigpro.process_signals",
+    "classifiers": {
+        "type": "preprocessor",
+        "subtype": "feature_extractor"
+    },
+    "produce": {
+        "args": [
+            {
+                "name": "data",
+                "type": "pandas.DataFrame"
+            },
+            {
+                "name": "transformations",
+                "type": "list"
+            },
+            {
+                "name": "aggregations",
+                "type": "list"
+            },
+            {
+                "name": "keep_values",
+                "type": "bool",
+                "default": false
+            },
+            {
+                "name": "values_column_name",
+                "type": "str",
+                "default": "values"
+            }
+        ],
+        "output": []
+    },
+    "hyperparameters": {
+        "fixed": {
+            "keep_values": {
+                "type": "bool",
+                "default": false
+            },
+            "values_column_name": {
+                "type": "str",
+                "default": "values"
+            }
+        }
+    }
+}
diff --git a/sigpro/primitives/sigpro/transformations/frequency/band/frequency_band.json b/sigpro/primitives/sigpro/transformations/frequency/band/frequency_band.json
@@ -0,0 +1,41 @@
+{
+    "name": "sigpro.transformations.frequency.band.frequency_band",
+    "primitive": "sigpro.transformations.frequency.band.frequency_band",
+    "classifiers": {
+        "type": "aggregation",
+        "subtype": "frequency"
+    },
+    "produce": {
+        "args": [
+            {
+                "name": "amplitude_values",
+                "type": "numpy.ndarray"
+            },
+            {
+                "name": "frequency_values",
+                "type": "numpy.ndarray"
+            }
+        ],
+        "output": [
+            {
+                "name": "amplitude_values",
+                "type": "numpy.ndarray"
+            },
+            {
+                "name": "frequency_values",
+                "type": "numpy.ndarray"
+            }
+        ]
+    },
+    "hyperparameters": {
+        "fixed": {
+            "low": {
+                "type": "int"
+            },
+            "high": {
+                "type": "int"
+            }
+        },
+        "tunable": {}
+    }
+}
diff --git a/sigpro/process_signals.py b/sigpro/process_signals.py
@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+"""Process Signals core functionality."""
+
+from collections import Counter
+
+import pandas as pd
+from mlblocks import MLPipeline, load_primitive
+
+
+def _build_pipeline(transformations, aggregations):
+    """Build Pipeline function.
+
+    Given a list of transformations and aggregations build a pipeline
+    with the output of the aggregations, which take as name the specified
+    name of the transformations and the aggregation. These lists are composed
+    by dictionaries with the following specification:
+
+        * ``name``:
+            Name of the transformation / aggregation.
+        * ``primitive``:
+            Name of the primitive to apply.
+        * ``init_params``:
+            Dictionary containing the initializing parameters for the primitive.
+
+    Args:
+        transformations (list):
+            List of dictionaries containing the transformation primitives.
+        aggregations (list):
+            List of dictionaries containing the aggregation primitives.
+
+    Returns:
+        mlblocks.MLPipeline:
+            An ``MLPipeline`` object that first applies all the transformations
+            and then produces as output the aggregations specified.
+    """
+    primitives = []
+    init_params = {}
+    prefix = []
+    outputs = []
+    counter = Counter()
+
+    for transformation in transformations:
+        prefix.append(transformation['name'])
+        primitive = transformation['primitive']
+        counter[primitive] += 1
+        primitive_name = f'{primitive}#{counter[primitive]}'
+        primitives.append(primitive)
+        params = transformation.get('init_params')
+        if params:
+            init_params[primitive_name] = params
+
+    prefix = '.'.join(prefix) if prefix else ''
+
+    for aggregation in aggregations:
+        aggregation_name = f'{prefix}.{aggregation["name"]}' if prefix else aggregation['name']
+
+        primitive = aggregation['primitive']
+        counter[primitive] += 1
+        primitive_name = f'{primitive}#{counter[primitive]}'
+        primitives.append(primitive)
+
+        primitive = load_primitive(primitive)
+        primitive_outputs = primitive['produce']['output']
+
+        for output in primitive_outputs:
+            output = output['name']
+            outputs.append({
+                'name': f'{aggregation_name}.{output}',
+                'variable': f'{primitive_name}.{output}'
+            })
+
+        params = aggregation.get('init_params')
+        if params:
+            init_params[primitive_name] = params
+
+    outputs = {'default': outputs} if outputs else None
+
+    return MLPipeline(
+        primitives,
+        init_params=init_params,
+        outputs=outputs
+    )
+
+
+def _apply_pipeline(row, pipeline, values_column_name):
+    """Apply a ``mlblocks.MLPipeline`` to a row.
+
+    Apply a ``MLPipeline`` to a row of a ``pd.DataFrame``, this function can
+    be combined with the ``pd.DataFrame.apply`` method to be applied to the
+    entire data frame.
+
+    Args:
+        row (pd.Series):
+            Row used to apply the pipeline to.
+        pipeline (mlblocks.MLPipeline):
+            Pipeline to be used for producing the results.
+        values_column_name (str):
+            The name of the column that contains the signal values.
+    """
+    context = row.to_dict()
+    amplitude_values = context.pop(values_column_name)
+    output = pipeline.predict(
+        amplitude_values=amplitude_values,
+        **context,
+    )
+    output_names = pipeline.get_output_names()
+
+    # ensure that we can iterate over output
+    output = output if isinstance(output, tuple) else (output, )
+
+    return pd.Series(dict(zip(output_names, output)))
+
+
+def process_signals(data, transformations, aggregations,
+                    values_column_name='values', keep_values=False):
+    """Process Signals.
+
+    The Process Signals is responsible for applying a collection of primitives specified by the
+    user in order to create features for the given data.
+
+    Given a list of transformations and aggregations which are composed
+    by dictionaries with the following specification:
+
+        * ``name``:
+            Name of the transformation / aggregation.
+        * ``primitive``:
+            Name of the primitive to apply.
+        * ``init_params``:
+            Dictionary containing the initializing parameters for the primitive.
+
+    ``process_signals`` builds an ``mlblocks.MLPipeline`` and generates the features by first
+    applying the transformations and then computing the aggregations.
+
+    Args:
+        data (pandas.DataFrame):
+            Dataframe with a column that contains signal values.
+        transformations (list):
+            List of dictionaries containing the transformation primitives.
+        aggregations (list):
+            List of dictionaries containing the aggregation primitives.
+        values_column_name (str):
+            The name of the column that contains the signal values. Defaults to ``values``.
+        keep_values (bool):
+            Whether or not to keep the original signal values or remove them.
+
+    Returns:
+        pandas.DataFrame:
+            A data frame with new feature columns by applying the previous primitives. If
+            ``keep_values`` is ``True`` the original signal values will be conserved in the
+            data frame, otherwise the original signal values will be deleted.
+    """
+    pipeline = _build_pipeline(transformations, aggregations)
+    features = data.apply(
+        _apply_pipeline,
+        args=(pipeline, values_column_name),
+        axis=1
+    )
+
+    data = pd.concat([data, features], axis=1)
+
+    if not keep_values:
+        del data[values_column_name]
+
+    return data
diff --git a/sigpro/transformations/frequency/band.py b/sigpro/transformations/frequency/band.py
@@ -0,0 +1,21 @@
+"""SigPro Frequency Band module."""
+
+
+def frequency_band(amplitude_values, frequency_values, low, high):
+    """Extract a specific band.
+
+    Filter between a high and low band frequency and return the amplitude values and frequency
+    values for those.
+
+    Args:
+        amplitude_values (np.ndarray):
+            A numpy array with the signal values.
+        frequency_values (np.ndarray):
+            A numpy array with the frequency values.
+    Returns:
+        tuple:
+            * `amplitude_values (numpy.ndarray)` for the selected frequency values.
+            * `frequency_values (numpy.ndarray)` for the selected frequency values.
+    """
+    mask = (frequency_values > low) & (frequency_values < high)
+    return amplitude_values[mask], frequency_values[mask]
diff --git a/sigpro/transformations/frequency/fft.py b/sigpro/transformations/frequency/fft.py
@@ -22,9 +22,10 @@ def fft(amplitude_values, sampling_frequency):
             * `frequency_values (numpy.ndarray)`
     """
     amplitude_values = np.fft.fft(amplitude_values)
-    frequency_values = np.fft.fftfreq(len(amplitude_values), sampling_frequency)
+    frequency_values = np.fft.fftfreq(len(amplitude_values), 1 / sampling_frequency)
 
-    return amplitude_values, frequency_values
+    length = len(frequency_values) // 2
+    return amplitude_values[:length], frequency_values[:length]
 
 
 def fft_real(amplitude_values, sampling_frequency):
@@ -48,7 +49,6 @@ def fft_real(amplitude_values, sampling_frequency):
             * `amplitude_values (numpy.ndarray)`
             * `frequency_values (numpy.ndarray)`
     """
-    amplitude_values = np.real(np.fft.fft(amplitude_values))
-    frequency_values = np.fft.fftfreq(len(amplitude_values), sampling_frequency)
+    amplitude_values, frequency_values = fft(amplitude_values, sampling_frequency)
 
-    return amplitude_values, frequency_values
+    return np.real(amplitude_values), np.real(frequency_values)
diff --git a/tests/integration/test_contributing.py b/tests/integration/test_contributing.py
@@ -116,5 +116,5 @@ def test_run_primitive_aggregation_hyperparameters():
 
 def test_run_primitive_transformation():
     result = run_primitive('sigpro.transformations.frequency.fft.fft')
-    assert len(result[0]) == 400
-    assert len(result[1]) == 400
+    assert len(result[0]) == 200
+    assert len(result[1]) == 200
diff --git a/tests/integration/test_demo.py b/tests/integration/test_demo.py
@@ -31,22 +31,22 @@ def test_get_amplitude_demo_indexed():
 
 def test_get_frequency_demo_without_index():
     values, frequency_values = get_frequency_demo()
-    assert EXPECTED_VALUES_LENGTH == len(values)
-    assert EXPECTED_FREQUENCY_LENGTH == len(frequency_values)
+    assert EXPECTED_VALUES_LENGTH // 2 == len(values)
+    assert EXPECTED_FREQUENCY_LENGTH // 2 == len(frequency_values)
 
 
 def test_get_frequency_demo_indexed():
     values, frequency_values = get_frequency_demo(index=1)
-    assert EXPECTED_VALUES_LENGTH == len(values)
-    assert EXPECTED_FREQUENCY_LENGTH == len(frequency_values)
+    assert EXPECTED_VALUES_LENGTH // 2 == len(values)
+    assert EXPECTED_FREQUENCY_LENGTH // 2 == len(frequency_values)
 
 
 def test_get_frequency_demo_complex():
     values, frequency_values = get_frequency_demo(real=False)
     value = values[0]
     assert type(value) == np.complex128
-    assert EXPECTED_VALUES_LENGTH == len(values)
-    assert EXPECTED_FREQUENCY_LENGTH == len(frequency_values)
+    assert EXPECTED_VALUES_LENGTH // 2 == len(values)
+    assert EXPECTED_FREQUENCY_LENGTH // 2 == len(frequency_values)
 
 
 def test_get_frequency_time_demo_without_index():
diff --git a/tests/integration/test_process_signals.py b/tests/integration/test_process_signals.py
diff --git a/tests/unit/test_process_signals.py b/tests/unit/test_process_signals.py
diff --git a/tests/unit/transformations/frequency/test_fft.py b/tests/unit/transformations/frequency/test_fft.py