|
8 | 8 | import pandas as pd |
9 | 9 | import pytest |
10 | 10 |
|
11 | | -from darts import TimeSeries, concatenate |
| 11 | +from darts import TimeSeries, concatenate, to_group_dataframe |
12 | 12 | from darts.dataprocessing.transformers import BoxCox, Scaler |
13 | 13 | from darts.tests.conftest import POLARS_AVAILABLE |
14 | 14 | from darts.timeseries import ( |
@@ -135,6 +135,137 @@ def test_ts_from_x(self, tag, tmpdir_module): |
135 | 135 | # Test without kwargs (new automatic serialization) |
136 | 136 | self.helper_test_transfer(tag, ts, TimeSeries.from_json(ts_json)) |
137 | 137 |
|
@pytest.mark.parametrize("backend", TEST_BACKENDS)
def test_to_dataframe_add_static_covariates(self, backend):
    """Tests adding global as well as component specific static covariates to dataframe.

    Exercises `TimeSeries.to_dataframe(add_static_covariates=...)` with `True`, a
    single column name, a list of column names in a custom order, and a list that
    contains an unknown name (expected to raise `ValueError`).
    """
    df = pd.DataFrame({
        "time": [0, 1, 2],
        "a": [1.0, 2.0, 3.0],
        "b": [4.0, 5.0, 6.0],
    })
    df = self.pd_to_backend(df, backend)
    static_covs = pd.DataFrame(
        {"sc1": ["a"], "sc2": ["b"]}, index=["global_components"]
    )
    series = TimeSeries.from_dataframe(
        df,
        time_col="time",
        static_covariates=static_covs,
    )
    assert series.static_covariates.equals(static_covs)

    # sanity check that by default no static covs are added
    kwargs = {"backend": backend, "time_as_index": False}
    assert series.to_dataframe(**kwargs).equals(df)

    # adding all static covariates
    df_out = nw.from_native(
        series.to_dataframe(**kwargs, add_static_covariates=True)
    ).to_pandas()
    assert df_out.columns.tolist() == ["time", "a", "b", "sc1", "sc2"]
    assert (df_out[["sc1", "sc2"]] == static_covs.values).all().all()

    # adding a single column as string
    df_out = nw.from_native(
        series.to_dataframe(**kwargs, add_static_covariates="sc2")
    ).to_pandas()
    assert df_out.columns.tolist() == ["time", "a", "b", "sc2"]
    assert (df_out[["sc2"]] == static_covs[["sc2"]].values).all().all()

    # adding a list of columns with different order
    df_out = nw.from_native(
        series.to_dataframe(**kwargs, add_static_covariates=["sc2", "sc1"])
    ).to_pandas()
    assert df_out.columns.tolist() == ["time", "a", "b", "sc2", "sc1"]
    assert (
        (df_out[["sc2", "sc1"]] == static_covs[["sc2", "sc1"]].values).all().all()
    )

    # unknown covariate names must be rejected
    with pytest.raises(ValueError, match="`add_static_covariates` do not exist"):
        _ = series.to_dataframe(
            **kwargs, add_static_covariates=["does_not_exist", "sc1"]
        )

    # component specific static covariates
    static_covs = pd.DataFrame(
        {"sc1": ["aa", "ab"], "sc2": ["ba", "bb"]}, index=["a", "b"]
    )
    series = TimeSeries.from_dataframe(
        df,
        time_col="time",
        static_covariates=static_covs,
    )
    assert series.static_covariates.equals(static_covs)
    df_out = nw.from_native(
        series.to_dataframe(**kwargs, add_static_covariates=True)
    ).to_pandas()
    assert df_out.columns.tolist() == [
        "time",
        "a",
        "b",
        "sc1_a",
        "sc1_b",
        "sc2_a",
        "sc2_b",
    ]
    # The exported columns are covariate-major ("sc1_a", "sc1_b", "sc2_a", "sc2_b"),
    # which is the column-major flattening of `static_covs`. Ask for that order
    # explicitly (order="F") instead of depending on whatever memory layout
    # `.values` happens to return (order="K").
    expected_values = static_covs.values.flatten(order="F")
    assert (
        (df_out[["sc1_a", "sc1_b", "sc2_a", "sc2_b"]] == expected_values)
        .all()
        .all()
    )
| 219 | + |
@pytest.mark.parametrize("backend", TEST_BACKENDS)
def test_to_dataframe_add_metadata(self, backend):
    """Tests adding metadata to dataframe.

    Covers `add_metadata` given as `True`, a single key, a list of keys in a custom
    order, and a list containing an unknown key (expected to raise `ValueError`).
    """
    meta = {"sc1": "a", "sc2": "b"}
    frame = self.pd_to_backend(
        pd.DataFrame({
            "time": [0, 1, 2],
            "a": [1.0, 2.0, 3.0],
            "b": [4.0, 5.0, 6.0],
        }),
        backend,
    )
    ts = TimeSeries.from_dataframe(frame, time_col="time", metadata=meta)
    assert ts.metadata == meta

    export_kwargs = {"backend": backend, "time_as_index": False}

    def export_as_pandas(**extra):
        # export with the shared kwargs and normalize the result to pandas
        native = ts.to_dataframe(**export_kwargs, **extra)
        return nw.from_native(native).to_pandas()

    # without `add_metadata` the export must match the input frame
    assert ts.to_dataframe(**export_kwargs).equals(frame)

    # `True` appends every metadata entry
    out = export_as_pandas(add_metadata=True)
    assert out.columns.tolist() == ["time", "a", "b", "sc1", "sc2"]
    assert (out[["sc1", "sc2"]] == meta.values()).all().all()

    # a single key given as string
    out = export_as_pandas(add_metadata="sc2")
    assert out.columns.tolist() == ["time", "a", "b", "sc2"]
    assert (out[["sc2"]] == meta["sc2"]).all().all()

    # a list of keys keeps the requested order
    requested = ["sc2", "sc1"]
    out = export_as_pandas(add_metadata=["sc2", "sc1"])
    assert out.columns.tolist() == ["time", "a", "b", "sc2", "sc1"]
    wanted = [meta[key] for key in requested]
    assert (out[["sc2", "sc1"]] == wanted).all().all()

    # unknown keys must be rejected
    with pytest.raises(ValueError, match="`add_metadata` do not exist"):
        _ = ts.to_dataframe(**export_kwargs, add_metadata=["does_not_exist", "sc1"])
| 268 | + |
138 | 269 | def test_invalid_metadata(self): |
139 | 270 | ts = linear_timeseries(length=10) |
140 | 271 | with pytest.raises(ValueError) as exc: |
@@ -180,6 +311,191 @@ def test_from_group_dataframe(self, config): |
180 | 311 | assert (ts[0].values().flatten() == [values[2], values[1], values[0]]).all() |
181 | 312 | assert (ts[1].values().flatten() == [values[3], values[4], values[5]]).all() |
182 | 313 |
|
@pytest.mark.parametrize(
    "backend,time_as_index",
    itertools.product(TEST_BACKENDS, [True, False]),
)
def test_to_group_dataframe_creation(self, backend, time_as_index):
    """Round-trips a grouped dataframe through `from_group_dataframe` and back.

    Two groups of five daily steps are split by "ID" and then rebuilt with
    `to_group_dataframe`; the result must equal the input once both are sorted.
    """
    source_pd = pd.DataFrame(
        data={
            "time": pd.date_range(start="2023-01-01", periods=10, freq="D"),
            "value": [float(step) for step in range(10)],
            "ID": [0.0] * 5 + [1.0] * 5,
        }
    )
    groups = TimeSeries.from_group_dataframe(
        self.pd_to_backend(source_pd, backend),
        time_col="time",
        group_cols="ID",
        value_cols="value",
    )

    if time_as_index:
        if backend == "polars":
            # NOTE(review): presumably because polars frames have no index -
            # the polars case always runs with the time column kept as a column
            time_as_index = False
        else:
            source_pd = source_pd.set_index("time")

    rebuilt = to_group_dataframe(
        groups,
        add_static_covariates=True,
        backend=backend,
        time_as_index=time_as_index,
    )

    sort_keys = ["ID", "time"]
    rebuilt_pd = nw.from_native(rebuilt).to_pandas().sort_values(sort_keys)
    assert rebuilt_pd.equals(source_pd.sort_values(sort_keys))
| 352 | + |
@pytest.mark.parametrize(
    "add_metadata,metadata_cols",
    [
        (True, ["source", "version", "created"]),
        (["source", "version", "created"], ["source", "version", "created"]),
        ("source", "source"),
    ],
)
def test_to_group_dataframe_metadata_support(self, add_metadata, metadata_cols):
    """Checks that metadata columns survive a group-dataframe round trip.

    `metadata_cols` selects which columns are stored as metadata when building the
    series; `add_metadata` selects which entries are written back.
    """
    meta = {
        "source": "test_data",
        "version": "1.0",
        "created": "2025-01-09",
    }
    # every metadata entry becomes a constant column of the input frame
    frame = pd.DataFrame({
        "value": [float(step) for step in range(10)] * 2,
        "ID": [0.0] * 10 + [1.0] * 10,
    }).assign(**meta)

    # a single column name is only wrapped when an explicit selection is given
    if add_metadata is not True and isinstance(metadata_cols, str):
        metadata_cols = [metadata_cols]
    extra_cols = list(meta) if add_metadata is True else metadata_cols
    subset = frame[["value", "ID"] + extra_cols]

    series_list = TimeSeries.from_group_dataframe(
        subset, group_cols="ID", metadata_cols=metadata_cols
    )
    rebuilt = to_group_dataframe(
        series_list, add_static_covariates=True, add_metadata=add_metadata
    )

    assert rebuilt.sort_values("ID").equals(subset.sort_values("ID"))
| 393 | + |
@pytest.mark.parametrize(
    "add_static_cov,expected_cols",
    [
        (True, ["split", "set"]),
        (["split", "set", "ID"], ["split", "set"]),
        (["set", "ID"], "set"),
    ],
)
def test_to_group_dataframe_global_static_cov_support(
    self, add_static_cov, expected_cols
):
    """Checks that global static covariates survive a group-dataframe round trip.

    `expected_cols` are stored as static columns when building the series;
    `add_static_cov` selects which static covariates are written back.
    """
    frame = pd.DataFrame({
        "value": [float(step) for step in range(10)] * 3,
        "ID": [0.0] * 10 + [1.0] * 10 + [2.0] * 10,
        "split": ["test"] * 10 + ["train"] * 20,
        "set": ["B"] * 20 + ["A"] * 10,
    })
    if isinstance(expected_cols, str):
        expected_cols = [expected_cols]
    subset = frame[["value", "ID"] + list(expected_cols)]

    series_list = TimeSeries.from_group_dataframe(
        subset, group_cols="ID", static_cols=expected_cols
    )
    rebuilt = to_group_dataframe(
        series_list, add_static_covariates=add_static_cov, add_metadata=True
    )

    wanted = subset.sort_values(["ID"])
    # align the column order with the input before comparing
    got = rebuilt.sort_values(["ID"])[wanted.columns]
    assert got.equals(wanted)
| 425 | + |
def test_to_group_dataframe_component_static_cov_support(self):
    """Checks that component-specific static covariates are exported per component.

    Each two-component series gets a two-row static covariates frame; the rebuilt
    dataframe is expected to hold one column per (covariate, component) pair.
    """
    frame = pd.DataFrame({
        "time": pd.date_range("2023-01-01", periods=3, freq="D").tolist() * 2,
        "value1": [1.0, 2.0, 3.0] * 2,
        "value2": [4.0, 5.0, 6.0] * 2,
        "ID": [0.0] * 3 + [1.0] * 3,
    })
    static_covs = pd.DataFrame(
        {
            "source": ["sensor_A", "sensor_B"],
            "region": ["EU", "US"],
        },
        index=[0, 1],
    )

    grouped = TimeSeries.from_group_dataframe(
        frame,
        group_cols="ID",
        value_cols=["value1", "value2"],
        time_col="time",
    )
    series_list = [ts.with_static_covariates(static_covs) for ts in grouped]

    rebuilt = to_group_dataframe(
        series_list,
        add_static_covariates=True,
        add_metadata=False,
        time_as_index=False,
    )

    # (exported column, covariate name, row of `static_covs` it must match)
    checks = [
        ("source_value1", "source", 0),
        ("source_value2", "source", 1),
        ("region_value1", "region", 0),
        ("region_value2", "region", 1),
    ]
    for column, _, _ in checks:
        assert column in rebuilt.columns

    for column, cov_name, row in checks:
        assert (rebuilt[column] == static_covs[cov_name].loc[row]).all()
| 469 | + |
@pytest.mark.parametrize("add_group_col", [True, "added_group_col"])
def test_to_group_dataframe_add_group_col(self, add_group_col):
    """Checks that `add_group_col` adds a column identifying each series.

    With static covariates and metadata disabled, the original "ID" column must be
    absent and the added group column must carry the same values instead.
    """
    frame = pd.DataFrame({
        "time": pd.date_range("2023-01-01", periods=10, freq="D"),
        "value": np.arange(10).astype("float"),
        "ID": [0] * 5 + [1] * 5,
    })
    series_list = TimeSeries.from_group_dataframe(
        frame,
        group_cols="ID",
        value_cols="value",
        time_col="time",
    )

    rebuilt = to_group_dataframe(
        series_list,
        add_static_covariates=False,
        add_metadata=False,
        add_group_col=add_group_col,
        time_as_index=False,
    )
    # for a non-string value the test expects the column to be named "group"
    group_col_name = add_group_col if isinstance(add_group_col, str) else "group"

    assert "ID" not in rebuilt.columns
    restored = rebuilt.rename(columns={group_col_name: "ID"})
    assert restored[frame.columns].equals(frame)
| 498 | + |
183 | 499 | @pytest.mark.parametrize("backend", TEST_BACKENDS) |
184 | 500 | def test_timeseries_from_longitudinal_df(self, backend): |
185 | 501 | # univariate static covs: only group by "st1", keep static covs "st1" |
|
0 commit comments