|
5 | 5 | from numpy.testing import assert_allclose |
6 | 6 |
|
7 | 7 | from mlprimitives.custom.timeseries_preprocessing import ( |
8 | | - intervals_to_mask, rolling_window_sequences, time_segments_aggregate, time_segments_average) |
| 8 | + cutoff_window_sequences, intervals_to_mask, rolling_window_sequences, time_segments_aggregate, |
| 9 | + time_segments_average) |
9 | 10 |
|
10 | 11 |
|
11 | 12 | class IntervalsToMaskTest(TestCase): |
@@ -239,3 +240,243 @@ def test_multiple(self): |
239 | 240 | expected_index = np.array([1, 3]) |
240 | 241 | self._run(X, interval, expected_values, expected_index, time_column=0, |
241 | 242 | method=['mean', 'median']) |
| 243 | + |
| 244 | + |
class CutoffWindowSequencesTest(TestCase):
    """Tests for ``cutoff_window_sequences``.

    The fixtures built in ``setUp`` are:

    * ``X``: two rows indexed by ``cutoff`` (2020-01-05 and 2020-01-07) with an
      ``id1`` column holding ``1`` and ``2`` respectively.
    * ``timeseries``: ten daily timestamps (2020-01-01 .. 2020-01-10) repeated
      twice, once for ``id1 == 1`` (``value1`` 1..10, ``value2`` 21..30) and once
      for ``id1 == 2`` (``value1`` 11..20, ``value2`` 31..40), indexed by
      ``timestamp``.

    The expected arrays in the tests contain only the ``value1`` and ``value2``
    columns of the rows that precede each cutoff.
    """

    # Expected output for a 3-row / 3-day window with the default fixtures:
    # the three days before 2020-01-05 for id1 == 1 and the three days before
    # 2020-01-07 for id1 == 2. Shared by every test that only varies how the
    # same window is specified.
    EXPECTED_3_DAY_WINDOWS = np.array([
        [[2, 22],
         [3, 23],
         [4, 24]],
        [[14, 34],
         [15, 35],
         [16, 36]],
    ])

    def setUp(self):
        """Build fresh ``X`` and ``timeseries`` fixtures for every test."""
        self.X = pd.DataFrame({
            'id1': [1, 2],
            'cutoff': pd.to_datetime(['2020-01-05', '2020-01-07'])
        }).set_index('cutoff')
        self.timeseries = pd.DataFrame({
            'timestamp': list(pd.date_range(
                start='2020-01-01',
                end='2020-01-10',
                freq='1d'
            )) * 2,
            'value1': np.arange(1, 21),
            'value2': np.arange(21, 41),
            'id1': [1] * 10 + [2] * 10
        }).set_index('timestamp')

    def test_cutoff_time_column(self):
        """Passing cutoff_time. The indicated column will be used as the cutoff time."""
        # setup
        timeseries = self.timeseries
        X = self.X.reset_index()

        # run
        array = cutoff_window_sequences(
            X,
            timeseries,
            window_size=3,
            cutoff_time='cutoff',
        )

        # assert
        assert_allclose(array, self.EXPECTED_3_DAY_WINDOWS)

    def test_time_index_column(self):
        """Passing time_index. The indicated column will be used as the timeseries index."""
        # setup
        X = self.X
        timeseries = self.timeseries.reset_index()

        # run
        array = cutoff_window_sequences(
            X,
            timeseries,
            window_size=3,
            time_index='timestamp',
        )

        # assert
        assert_allclose(array, self.EXPECTED_3_DAY_WINDOWS)

    def test_window_size_integer(self):
        """window_size accepts integer."""
        # setup
        X = self.X
        timeseries = self.timeseries

        # run
        array = cutoff_window_sequences(
            X,
            timeseries,
            window_size=3,
        )

        # assert
        assert_allclose(array, self.EXPECTED_3_DAY_WINDOWS)

    def test_window_size_string(self):
        """window_size accepts string."""
        # setup
        X = self.X
        timeseries = self.timeseries

        # run
        array = cutoff_window_sequences(
            X,
            timeseries,
            window_size='3d',
        )

        # assert
        assert_allclose(array, self.EXPECTED_3_DAY_WINDOWS)

    def test_window_size_timedelta(self):
        """window_size accepts Timedelta object."""
        # setup
        X = self.X
        timeseries = self.timeseries

        # run
        array = cutoff_window_sequences(
            X,
            timeseries,
            window_size=pd.Timedelta(days=3),
        )

        # assert
        assert_allclose(array, self.EXPECTED_3_DAY_WINDOWS)

    def test_not_enough_data(self):
        """If there is not enough data for the given window_size, shape changes."""
        # setup
        X = self.X
        timeseries = self.timeseries

        # run
        array = cutoff_window_sequences(
            X,
            timeseries,
            window_size=5,
        )

        # assert
        assert len(array) == 2

        # The two windows have different lengths (4 and 5 rows), so they are
        # kept in a plain list: wrapping ragged sequences in a single
        # ``np.array`` raises ``ValueError`` on NumPy >= 1.24.
        expected_windows = [
            np.array([
                [1, 21],
                [2, 22],
                [3, 23],
                [4, 24]
            ]),
            np.array([
                [12, 32],
                [13, 33],
                [14, 34],
                [15, 35],
                [16, 36]
            ]),
        ]

        assert_allclose(array[0], expected_windows[0])
        assert_allclose(array[1], expected_windows[1])

    def test_cutoff_time_only(self):
        """Test X without any other column than cutoff_time."""
        # setup: drop the identifier from both frames without mutating the fixtures
        X = self.X.drop(columns='id1')
        timeseries = self.timeseries.drop(columns='id1')

        # run
        array = cutoff_window_sequences(
            X,
            timeseries,
            window_size=3,
        )

        # assert
        expected_array = np.array([
            [[12, 32],
             [13, 33],
             [14, 34]],
            [[14, 34],
             [15, 35],
             [16, 36]]
        ])

        assert_allclose(array, expected_array)

    def test_multiple_filter(self):
        """Test X with two identifier columns."""
        # setup: add a second identifier; in the timeseries it alternates 3, 4
        # so each (id1, id2) pair only keeps every other day.
        X = self.X.assign(id2=[3, 4])
        timeseries = self.timeseries.assign(id2=[3, 4] * 10)

        # run
        array = cutoff_window_sequences(
            X,
            timeseries,
            window_size=2,
        )

        # assert
        expected_array = np.array([
            [[1, 21],
             [3, 23]],
            [[14, 34],
             [16, 36]]
        ])

        assert_allclose(array, expected_array)
0 commit comments