Skip to content

Commit 9af5685

Browse files
authored
Merge pull request #240 from joanvaquer/issue239-window_size
Accept timedelta and string in window_size in cutoff_window_sequences
2 parents dd0570a + 7850291 commit 9af5685

File tree

2 files changed

+255
-4
lines changed

2 files changed

+255
-4
lines changed

mlprimitives/custom/timeseries_preprocessing.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,8 +213,10 @@ def cutoff_window_sequences(X, timeseries, window_size, cutoff_time=None, time_i
213213
timeseries (pandas.DataFrame):
214214
``pandas.DataFrame`` containing the actual timeseries data. The time index
215215
can either be set as the DataFrame index or as a column.
216-
window_size (int):
217-
Numer of elements to take before the cutoff time for each sequence.
216+
window_size (int, str or Timedelta):
217+
If an integer is passed, it is the number of elements to take before the
218+
cutoff time for each sequence. If a string or a Timedelta object is passed,
219+
it is the length of the time period, counted back from the last element before the cutoff, from which the elements are taken.
218220
cutoff_time (str):
219221
Optional. If given, the indicated column will be used as the cutoff time.
220222
Otherwise, the table index will be used.
@@ -237,6 +239,9 @@ def cutoff_window_sequences(X, timeseries, window_size, cutoff_time=None, time_i
237239

238240
columns = list(X.columns)
239241

242+
if not isinstance(window_size, int):
243+
window_size = pd.to_timedelta(window_size)
244+
240245
output = list()
241246
for idx, row in enumerate(X.itertuples()):
242247
selected = timeseries[timeseries.index < row.Index]
@@ -246,7 +251,12 @@ def cutoff_window_sequences(X, timeseries, window_size, cutoff_time=None, time_i
246251
mask &= selected.pop(column) == getattr(row, column)
247252

248253
selected = selected[mask]
249-
selected = selected.iloc[-window_size:]
254+
255+
if not isinstance(window_size, int):
256+
min_time = selected.index[-1] - window_size
257+
selected = selected.loc[selected.index > min_time]
258+
else:
259+
selected = selected.iloc[-window_size:]
250260

251261
len_selected = len(selected)
252262
if (len_selected != window_size):

tests/custom/test_timeseries_preprocessing.py

Lines changed: 242 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from numpy.testing import assert_allclose
66

77
from mlprimitives.custom.timeseries_preprocessing import (
8-
intervals_to_mask, rolling_window_sequences, time_segments_aggregate, time_segments_average)
8+
cutoff_window_sequences, intervals_to_mask, rolling_window_sequences, time_segments_aggregate,
9+
time_segments_average)
910

1011

1112
class IntervalsToMaskTest(TestCase):
@@ -239,3 +240,243 @@ def test_multiple(self):
239240
expected_index = np.array([1, 3])
240241
self._run(X, interval, expected_values, expected_index, time_column=0,
241242
method=['mean', 'median'])
243+
244+
245+
class CutoffWindowSequencesTest(TestCase):
246+
247+
def setUp(self):
248+
self.X = pd.DataFrame({
249+
'id1': [1, 2],
250+
'cutoff': pd.to_datetime(['2020-01-05', '2020-01-07'])
251+
}).set_index('cutoff')
252+
self.timeseries = pd.DataFrame({
253+
'timestamp': list(pd.date_range(
254+
start='2020-01-01',
255+
end='2020-01-10',
256+
freq='1d'
257+
)) * 2,
258+
'value1': np.arange(1, 21),
259+
'value2': np.arange(21, 41),
260+
'id1': [1] * 10 + [2] * 10
261+
}).set_index('timestamp')
262+
263+
def test_cutoff_time_column(self):
264+
"""Passing cutoff_time. The indicated column will be used as the cutoff time."""
265+
# setup
266+
timeseries = self.timeseries
267+
X = self.X.reset_index()
268+
269+
# run
270+
array = cutoff_window_sequences(
271+
X,
272+
timeseries,
273+
window_size=3,
274+
cutoff_time='cutoff',
275+
)
276+
277+
# assert
278+
expected_array = np.array([
279+
[[2, 22],
280+
[3, 23],
281+
[4, 24]],
282+
[[14, 34],
283+
[15, 35],
284+
[16, 36]]
285+
])
286+
287+
assert_allclose(array, expected_array)
288+
289+
def test_time_index_column(self):
290+
"""Passing time_index. The indicated column will be used as the timeseries index."""
291+
# setup
292+
X = self.X
293+
timeseries = self.timeseries.reset_index()
294+
295+
# run
296+
array = cutoff_window_sequences(
297+
X,
298+
timeseries,
299+
window_size=3,
300+
time_index='timestamp',
301+
)
302+
303+
# assert
304+
expected_array = np.array([
305+
[[2, 22],
306+
[3, 23],
307+
[4, 24]],
308+
[[14, 34],
309+
[15, 35],
310+
[16, 36]]
311+
])
312+
313+
assert_allclose(array, expected_array)
314+
315+
def test_window_size_integer(self):
316+
"""window_size accepts integer."""
317+
# setup
318+
X = self.X
319+
timeseries = self.timeseries
320+
321+
# run
322+
array = cutoff_window_sequences(
323+
X,
324+
timeseries,
325+
window_size=3,
326+
)
327+
328+
# assert
329+
expected_array = np.array([
330+
[[2, 22],
331+
[3, 23],
332+
[4, 24]],
333+
[[14, 34],
334+
[15, 35],
335+
[16, 36]]
336+
])
337+
338+
assert_allclose(array, expected_array)
339+
340+
def test_window_size_string(self):
341+
"""window_size accepts string."""
342+
# setup
343+
X = self.X
344+
timeseries = self.timeseries
345+
346+
# run
347+
array = cutoff_window_sequences(
348+
X,
349+
timeseries,
350+
window_size='3d',
351+
)
352+
353+
# assert
354+
expected_array = np.array([
355+
[[2, 22],
356+
[3, 23],
357+
[4, 24]],
358+
[[14, 34],
359+
[15, 35],
360+
[16, 36]]
361+
])
362+
363+
assert_allclose(array, expected_array)
364+
365+
def test_window_size_timedelta(self):
366+
"""window_size accepts Timedelta object."""
367+
# setup
368+
X = self.X
369+
timeseries = self.timeseries
370+
371+
# run
372+
array = cutoff_window_sequences(
373+
X,
374+
timeseries,
375+
window_size=pd.Timedelta(days=3),
376+
)
377+
378+
# assert
379+
expected_array = np.array([
380+
[[2, 22],
381+
[3, 23],
382+
[4, 24]],
383+
[[14, 34],
384+
[15, 35],
385+
[16, 36]]
386+
])
387+
388+
assert_allclose(array, expected_array)
389+
390+
def test_not_enough_data(self):
391+
"""If there is not enough data for the given window_size, shape changes."""
392+
# setup
393+
X = self.X
394+
timeseries = self.timeseries
395+
396+
# run
397+
array = cutoff_window_sequences(
398+
X,
399+
timeseries,
400+
window_size=5,
401+
)
402+
403+
# assert
404+
assert len(array) == 2
405+
406+
expected_array = np.array([
407+
np.array([
408+
[1, 21],
409+
[2, 22],
410+
[3, 23],
411+
[4, 24]
412+
]),
413+
np.array([
414+
[12, 32],
415+
[13, 33],
416+
[14, 34],
417+
[15, 35],
418+
[16, 36]
419+
])
420+
])
421+
422+
assert_allclose(
423+
array[0],
424+
expected_array[0]
425+
)
426+
427+
assert_allclose(
428+
array[1],
429+
expected_array[1]
430+
)
431+
432+
def test_cutoff_time_only(self):
433+
"""Test X without any other column than cutoff_time."""
434+
# setup
435+
X = self.X
436+
del X['id1']
437+
timeseries = self.timeseries
438+
del timeseries['id1']
439+
440+
# run
441+
array = cutoff_window_sequences(
442+
X,
443+
timeseries,
444+
window_size=3,
445+
)
446+
447+
# assert
448+
expected_array = np.array([
449+
[[12, 32],
450+
[13, 33],
451+
[14, 34]],
452+
[[14, 34],
453+
[15, 35],
454+
[16, 36]]
455+
])
456+
457+
assert_allclose(array, expected_array)
458+
459+
def test_multiple_filter(self):
460+
"""Test X with two identifier columns."""
461+
# setup
462+
X = self.X
463+
X['id2'] = [3, 4]
464+
timeseries = self.timeseries
465+
timeseries['id2'] = [3, 4] * 10
466+
467+
# run
468+
array = cutoff_window_sequences(
469+
X,
470+
timeseries,
471+
window_size=2,
472+
)
473+
474+
# assert
475+
expected_array = np.array([
476+
[[1, 21],
477+
[3, 23]],
478+
[[14, 34],
479+
[16, 36]]
480+
])
481+
482+
assert_allclose(array, expected_array)

0 commit comments

Comments
 (0)