Merge pull request #240 from joanvaquer/issue239-window_size

csala · web-flow · commit 9af568515220 · 2020-07-28T15:41:15.000+02:00
Accept timedelta and string in window_size in cutoff_window_sequences
diff --git a/mlprimitives/custom/timeseries_preprocessing.py b/mlprimitives/custom/timeseries_preprocessing.py
@@ -213,8 +213,10 @@ def cutoff_window_sequences(X, timeseries, window_size, cutoff_time=None, time_i
         timeseries (pandas.DataFrame):
             ``pandas.DataFrame`` containing the actual timeseries data. The time index
             and either be set as the DataFrame index or as a column.
-        window_size (int):
-            Numer of elements to take before the cutoff time for each sequence.
+        window_size (int, str or Timedelta):
+            If an integer is passed, it is the number of elements to take before the
+            cutoff time for each sequence. If a string or a Timedelta object is passed,
+            it is the time span, counted back from the last element, to take elements from.
         cutoff_time (str):
             Optional. If given, the indicated column will be used as the cutoff time.
             Otherwise, the table index will be used.
@@ -237,6 +239,9 @@ def cutoff_window_sequences(X, timeseries, window_size, cutoff_time=None, time_i
 
     columns = list(X.columns)
 
+    if not isinstance(window_size, int):
+        window_size = pd.to_timedelta(window_size)
+
     output = list()
     for idx, row in enumerate(X.itertuples()):
         selected = timeseries[timeseries.index < row.Index]
@@ -246,7 +251,12 @@ def cutoff_window_sequences(X, timeseries, window_size, cutoff_time=None, time_i
             mask &= selected.pop(column) == getattr(row, column)
 
         selected = selected[mask]
-        selected = selected.iloc[-window_size:]
+
+        if not isinstance(window_size, int):
+            min_time = selected.index[-1] - window_size
+            selected = selected.loc[selected.index > min_time]
+        else:
+            selected = selected.iloc[-window_size:]
 
         len_selected = len(selected)
         if (len_selected != window_size):
diff --git a/tests/custom/test_timeseries_preprocessing.py b/tests/custom/test_timeseries_preprocessing.py
@@ -5,7 +5,8 @@
 from numpy.testing import assert_allclose
 
 from mlprimitives.custom.timeseries_preprocessing import (
-    intervals_to_mask, rolling_window_sequences, time_segments_aggregate, time_segments_average)
+    cutoff_window_sequences, intervals_to_mask, rolling_window_sequences, time_segments_aggregate,
+    time_segments_average)
 
 
 class IntervalsToMaskTest(TestCase):
@@ -239,3 +240,243 @@ def test_multiple(self):
         expected_index = np.array([1, 3])
         self._run(X, interval, expected_values, expected_index, time_column=0,
                   method=['mean', 'median'])
+
+
+class CutoffWindowSequencesTest(TestCase):
+    """Tests for ``cutoff_window_sequences`` on two daily series told apart by ``id1``."""
+
+    def setUp(self):
+        # Rebuilt before every test because some tests modify these frames in place.
+        self.X = pd.DataFrame({
+            'id1': [1, 2],
+            'cutoff': pd.to_datetime(['2020-01-05', '2020-01-07'])
+        }).set_index('cutoff')
+        self.timeseries = pd.DataFrame({
+            'timestamp': list(pd.date_range(
+                start='2020-01-01',
+                end='2020-01-10',
+                freq='1d'
+            )) * 2,
+            'value1': np.arange(1, 21),
+            'value2': np.arange(21, 41),
+            'id1': [1] * 10 + [2] * 10
+        }).set_index('timestamp')
+
+    def test_cutoff_time_column(self):
+        """Passing cutoff_time. The indicated column will be used as the cutoff time."""
+        # setup
+        timeseries = self.timeseries
+        X = self.X.reset_index()
+
+        # run
+        array = cutoff_window_sequences(
+            X,
+            timeseries,
+            window_size=3,
+            cutoff_time='cutoff',
+        )
+
+        # assert
+        expected_array = np.array([
+            [[2, 22],
+             [3, 23],
+             [4, 24]],
+            [[14, 34],
+             [15, 35],
+             [16, 36]]
+        ])
+
+        assert_allclose(array, expected_array)
+
+    def test_time_index_column(self):
+        """Passing time_index. The indicated column will be used as the timeseries index."""
+        # setup
+        X = self.X
+        timeseries = self.timeseries.reset_index()
+
+        # run
+        array = cutoff_window_sequences(
+            X,
+            timeseries,
+            window_size=3,
+            time_index='timestamp',
+        )
+
+        # assert
+        expected_array = np.array([
+            [[2, 22],
+             [3, 23],
+             [4, 24]],
+            [[14, 34],
+             [15, 35],
+             [16, 36]]
+        ])
+
+        assert_allclose(array, expected_array)
+
+    def test_window_size_integer(self):
+        """window_size accepts integer."""
+        # setup
+        X = self.X
+        timeseries = self.timeseries
+
+        # run
+        array = cutoff_window_sequences(
+            X,
+            timeseries,
+            window_size=3,
+        )
+
+        # assert
+        expected_array = np.array([
+            [[2, 22],
+             [3, 23],
+             [4, 24]],
+            [[14, 34],
+             [15, 35],
+             [16, 36]]
+        ])
+
+        assert_allclose(array, expected_array)
+
+    def test_window_size_string(self):
+        """window_size accepts string."""
+        # setup
+        X = self.X
+        timeseries = self.timeseries
+
+        # run
+        array = cutoff_window_sequences(
+            X,
+            timeseries,
+            window_size='3d',
+        )
+
+        # assert
+        expected_array = np.array([
+            [[2, 22],
+             [3, 23],
+             [4, 24]],
+            [[14, 34],
+             [15, 35],
+             [16, 36]]
+        ])
+
+        assert_allclose(array, expected_array)
+
+    def test_window_size_timedelta(self):
+        """window_size accepts Timedelta object."""
+        # setup
+        X = self.X
+        timeseries = self.timeseries
+
+        # run
+        array = cutoff_window_sequences(
+            X,
+            timeseries,
+            window_size=pd.Timedelta(days=3),
+        )
+
+        # assert
+        expected_array = np.array([
+            [[2, 22],
+             [3, 23],
+             [4, 24]],
+            [[14, 34],
+             [15, 35],
+             [16, 36]]
+        ])
+
+        assert_allclose(array, expected_array)
+
+    def test_not_enough_data(self):
+        """If there is not enough data for the given window_size, shape changes."""
+        # setup
+        X = self.X
+        timeseries = self.timeseries
+
+        # run
+        array = cutoff_window_sequences(
+            X,
+            timeseries,
+            window_size=5,
+        )
+
+        # assert
+        assert len(array) == 2
+
+        # Kept as a plain list: the windows differ in length, so np.array cannot stack them.
+        expected_array = [
+            np.array([
+                [1, 21],
+                [2, 22],
+                [3, 23],
+                [4, 24]
+            ]),
+            np.array([
+                [12, 32],
+                [13, 33],
+                [14, 34],
+                [15, 35],
+                [16, 36]
+            ])
+        ]
+
+        assert_allclose(array[0], expected_array[0])
+        assert_allclose(array[1], expected_array[1])
+
+    def test_cutoff_time_only(self):
+        """Test X without any other column than cutoff_time."""
+        # setup
+        X = self.X
+        del X['id1']
+        timeseries = self.timeseries
+        del timeseries['id1']
+
+        # run
+        array = cutoff_window_sequences(
+            X,
+            timeseries,
+            window_size=3,
+        )
+
+        # assert
+        # With no id columns left, rows are only selected by time, so both windows
+        # come from the second copy of the date range (value1 in 11..20).
+        expected_array = np.array([
+            [[12, 32],
+             [13, 33],
+             [14, 34]],
+            [[14, 34],
+             [15, 35],
+             [16, 36]]
+        ])
+
+        assert_allclose(array, expected_array)
+
+    def test_multiple_filter(self):
+        """Test X with two identifier columns."""
+        # setup
+        X = self.X
+        X['id2'] = [3, 4]
+        timeseries = self.timeseries
+        timeseries['id2'] = [3, 4] * 10
+
+        # run
+        array = cutoff_window_sequences(
+            X,
+            timeseries,
+            window_size=2,
+        )
+
+        # assert
+        # id2 alternates 3, 4 along the timeseries, so matching on both id1 and id2
+        # keeps every other row of each series.
+        expected_array = np.array([
+            [[1, 21],
+             [3, 23]],
+            [[14, 34],
+             [16, 36]]
+        ])
+
+        assert_allclose(array, expected_array)