Skip to content

Commit d3a57dd

Browse files
Jaco-Pastorius, jakubchlapek, and dennisbader
authored
Feat/add to group dataframe() function (#3000)
* Initial commit * Datasets deleted * test modif * Set default add_static_cov to False * Updated CHANGELOG.md and added docstring in to_group_dataframe() * modified CHANGELOG.md * reverted changes in example 15 * Feedback implementation, code coverage fixed, addition of add_group_col argument * error clarification * docstring clarification * Added condition for custom group column name * minor updates --------- Co-authored-by: Jakub Chłapek <147340544+jakubchlapek@users.noreply.github.com> Co-authored-by: dennisbader <dennis.bader@gmx.ch>
1 parent d92bf93 commit d3a57dd

File tree

4 files changed

+492
-16
lines changed

4 files changed

+492
-16
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
2828
- Includes automatic downsampling for large series (configurable via `downsample_threshold` parameter) to avoid crashes when plotting large series
2929
- Integrates seamlessly with `plotting.use_darts_style` which now affects both `TimeSeries.plot()` and `TimeSeries.plotly()`
3030
- Plotly remains an optional dependency and can be installed with `pip install plotly`
31+
- `TimeSeries.to_dataframe()` now supports fine-grained control over adding the time series' metadata and static covariates to the output DataFrame with the new parameters `add_static_cov` and `add_metadata`. [#2965](https://github.com/unit8co/darts/issues/2965) by [Gabriel Margaria](https://github.com/Jaco-Pastorius).
32+
- Added method `TimeSeries.to_group_dataframe()` to convert a list of time series into a long DataFrame (pandas, polars, ...) - e.g. into the DataFrame format that is used as input for `from_group_dataframe()`. [#2965](https://github.com/unit8co/darts/issues/2965) by [Gabriel Margaria](https://github.com/Jaco-Pastorius).
3133

3234
**Fixed**
3335

@@ -41,9 +43,9 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
4143
- We set an upper version cap on `pandas<3.0.0` until we officially support it. [#2995](https://github.com/unit8co/darts/pull/2995) by [Dennis Bader](https://github.com/dennisbader).
4244

4345
### For developers of the library:
44-
- Removed unit tests from the package distribution (PyPI and conda-forge) to reduce package size. [#2979](https://github.com/unit8co/darts/pull/2979) and [u8darts-feedstock#59](https://github.com/conda-forge/u8darts-feedstock/pull/59) by [Zhihao Dai](https://github.com/daidahao).
4546

4647
- Migrated the dependency management and tooling to [uv](https://docs.astral.sh/uv/). Use `uv sync --group dev-all` to setup your development environment; See the [contribution guide](https://github.com/unit8co/darts/blob/master/CONTRIBUTING.md) for further information. [#2993](https://github.com/unit8co/darts/pull/2993) by [Jules Authier](https://github.com/authierj).
48+
- Removed unit tests from the package distribution (PyPI and conda-forge) to reduce package size. [#2979](https://github.com/unit8co/darts/pull/2979) and [u8darts-feedstock#59](https://github.com/conda-forge/u8darts-feedstock/pull/59) by [Zhihao Dai](https://github.com/daidahao).
4749

4850
## [0.40.0](https://github.com/unit8co/darts/tree/0.40.0) (2025-12-23)
4951

darts/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,20 @@
1212
reset_option,
1313
set_option,
1414
)
15-
from darts.timeseries import TimeSeries, concatenate, slice_intersect
15+
from darts.timeseries import (
16+
TimeSeries,
17+
concatenate,
18+
slice_intersect,
19+
to_group_dataframe,
20+
)
1621

1722
__version__ = "0.40.0"
1823

1924
__all__ = [
2025
"TimeSeries",
2126
"concatenate",
2227
"slice_intersect",
28+
"to_group_dataframe",
2329
"get_option",
2430
"set_option",
2531
"reset_option",

darts/tests/test_timeseries_static_covariates.py

Lines changed: 317 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import pandas as pd
99
import pytest
1010

11-
from darts import TimeSeries, concatenate
11+
from darts import TimeSeries, concatenate, to_group_dataframe
1212
from darts.dataprocessing.transformers import BoxCox, Scaler
1313
from darts.tests.conftest import POLARS_AVAILABLE
1414
from darts.timeseries import (
@@ -135,6 +135,137 @@ def test_ts_from_x(self, tag, tmpdir_module):
135135
# Test without kwargs (new automatic serialization)
136136
self.helper_test_transfer(tag, ts, TimeSeries.from_json(ts_json))
137137

138+
@pytest.mark.parametrize("backend", TEST_BACKENDS)
139+
def test_to_dataframe_add_static_covariates(self, backend):
140+
"""Tests adding global as well as component specific static covariates to dataframe."""
141+
df = pd.DataFrame({
142+
"time": [0, 1, 2],
143+
"a": [1.0, 2.0, 3.0],
144+
"b": [4.0, 5.0, 6.0],
145+
})
146+
df = self.pd_to_backend(df, backend)
147+
static_covs = pd.DataFrame(
148+
{"sc1": ["a"], "sc2": ["b"]}, index=["global_components"]
149+
)
150+
series = TimeSeries.from_dataframe(
151+
df,
152+
time_col="time",
153+
static_covariates=static_covs,
154+
)
155+
assert series.static_covariates.equals(static_covs)
156+
157+
# sanity check that by default no static covs are added
158+
kwargs = {"backend": backend, "time_as_index": False}
159+
assert series.to_dataframe(**kwargs).equals(df)
160+
161+
# adding all static covariates
162+
df_out = nw.from_native(
163+
series.to_dataframe(**kwargs, add_static_covariates=True)
164+
).to_pandas()
165+
assert df_out.columns.tolist() == ["time", "a", "b", "sc1", "sc2"]
166+
assert (df_out[["sc1", "sc2"]] == static_covs.values).all().all()
167+
168+
# adding a single column as string
169+
df_out = nw.from_native(
170+
series.to_dataframe(**kwargs, add_static_covariates="sc2")
171+
).to_pandas()
172+
assert df_out.columns.tolist() == ["time", "a", "b", "sc2"]
173+
assert (df_out[["sc2"]] == static_covs[["sc2"]].values).all().all()
174+
175+
# adding a list of columns with different order
176+
df_out = nw.from_native(
177+
series.to_dataframe(**kwargs, add_static_covariates=["sc2", "sc1"])
178+
).to_pandas()
179+
assert df_out.columns.tolist() == ["time", "a", "b", "sc2", "sc1"]
180+
assert (
181+
(df_out[["sc2", "sc1"]] == static_covs[["sc2", "sc1"]].values).all().all()
182+
)
183+
184+
with pytest.raises(ValueError, match="`add_static_covariates` do not exist"):
185+
_ = series.to_dataframe(
186+
**kwargs, add_static_covariates=["does_not_exist", "sc1"]
187+
)
188+
189+
# component specific static covariates
190+
static_covs = pd.DataFrame(
191+
{"sc1": ["aa", "ab"], "sc2": ["ba", "bb"]}, index=["a", "b"]
192+
)
193+
series = TimeSeries.from_dataframe(
194+
df,
195+
time_col="time",
196+
static_covariates=static_covs,
197+
)
198+
assert series.static_covariates.equals(static_covs)
199+
df_out = nw.from_native(
200+
series.to_dataframe(**kwargs, add_static_covariates=True)
201+
).to_pandas()
202+
assert df_out.columns.tolist() == [
203+
"time",
204+
"a",
205+
"b",
206+
"sc1_a",
207+
"sc1_b",
208+
"sc2_a",
209+
"sc2_b",
210+
]
211+
assert (
212+
(
213+
df_out[["sc1_a", "sc1_b", "sc2_a", "sc2_b"]]
214+
== static_covs.values.flatten(order="K")
215+
)
216+
.all()
217+
.all()
218+
)
219+
220+
@pytest.mark.parametrize("backend", TEST_BACKENDS)
221+
def test_to_dataframe_add_metadata(self, backend):
222+
"""Tests adding metadata to dataframe."""
223+
df = pd.DataFrame({
224+
"time": [0, 1, 2],
225+
"a": [1.0, 2.0, 3.0],
226+
"b": [4.0, 5.0, 6.0],
227+
})
228+
df = self.pd_to_backend(df, backend)
229+
metadata = {"sc1": "a", "sc2": "b"}
230+
series = TimeSeries.from_dataframe(
231+
df,
232+
time_col="time",
233+
metadata=metadata,
234+
)
235+
assert series.metadata == metadata
236+
237+
# sanity check that by default no metadata are added
238+
kwargs = {"backend": backend, "time_as_index": False}
239+
assert series.to_dataframe(**kwargs).equals(df)
240+
241+
# adding all metadata
242+
df_out = nw.from_native(
243+
series.to_dataframe(**kwargs, add_metadata=True)
244+
).to_pandas()
245+
assert df_out.columns.tolist() == ["time", "a", "b", "sc1", "sc2"]
246+
assert (df_out[["sc1", "sc2"]] == metadata.values()).all().all()
247+
248+
# adding a single column as string
249+
df_out = nw.from_native(
250+
series.to_dataframe(**kwargs, add_metadata="sc2")
251+
).to_pandas()
252+
assert df_out.columns.tolist() == ["time", "a", "b", "sc2"]
253+
assert (df_out[["sc2"]] == metadata["sc2"]).all().all()
254+
255+
# adding a list of columns with different order
256+
df_out = nw.from_native(
257+
series.to_dataframe(**kwargs, add_metadata=["sc2", "sc1"])
258+
).to_pandas()
259+
assert df_out.columns.tolist() == ["time", "a", "b", "sc2", "sc1"]
260+
assert (
261+
(df_out[["sc2", "sc1"]] == [metadata[key] for key in ["sc2", "sc1"]])
262+
.all()
263+
.all()
264+
)
265+
266+
with pytest.raises(ValueError, match="`add_metadata` do not exist"):
267+
_ = series.to_dataframe(**kwargs, add_metadata=["does_not_exist", "sc1"])
268+
138269
def test_invalid_metadata(self):
139270
ts = linear_timeseries(length=10)
140271
with pytest.raises(ValueError) as exc:
@@ -180,6 +311,191 @@ def test_from_group_dataframe(self, config):
180311
assert (ts[0].values().flatten() == [values[2], values[1], values[0]]).all()
181312
assert (ts[1].values().flatten() == [values[3], values[4], values[5]]).all()
182313

314+
@pytest.mark.parametrize(
315+
"backend,time_as_index",
316+
itertools.product(
317+
TEST_BACKENDS,
318+
[True, False],
319+
),
320+
)
321+
def test_to_group_dataframe_creation(self, backend, time_as_index):
322+
df_pd = pd.DataFrame(
323+
data={
324+
"time": pd.date_range(start="2023-01-01", periods=10, freq="D"),
325+
"value": [float(i) for i in range(10)],
326+
"ID": [0.0] * 5 + [1.0] * 5,
327+
}
328+
)
329+
df = self.pd_to_backend(df_pd, backend)
330+
series = TimeSeries.from_group_dataframe(
331+
df,
332+
time_col="time",
333+
group_cols="ID",
334+
value_cols="value",
335+
)
336+
337+
if time_as_index and backend == "polars":
338+
time_as_index = False
339+
elif time_as_index:
340+
df_pd = df_pd.set_index("time")
341+
342+
reconstructed = to_group_dataframe(
343+
series,
344+
add_static_covariates=True,
345+
backend=backend,
346+
time_as_index=time_as_index,
347+
)
348+
reconstructed_pd = nw.from_native(reconstructed).to_pandas()
349+
expected = df_pd.sort_values(["ID", "time"])
350+
reconstructed_pd = reconstructed_pd.sort_values(["ID", "time"])
351+
assert reconstructed_pd.equals(expected)
352+
353+
@pytest.mark.parametrize(
354+
"add_metadata,metadata_cols",
355+
[
356+
(True, ["source", "version", "created"]),
357+
(["source", "version", "created"], ["source", "version", "created"]),
358+
("source", "source"),
359+
],
360+
)
361+
def test_to_group_dataframe_metadata_support(self, add_metadata, metadata_cols):
362+
df = pd.DataFrame({
363+
"value": [float(i) for i in range(10)] * 2,
364+
"ID": [0.0] * 10 + [1.0] * 10,
365+
})
366+
metadata = {
367+
"source": "test_data",
368+
"version": "1.0",
369+
"created": "2025-01-09",
370+
}
371+
for k, v in metadata.items():
372+
df[k] = v
373+
374+
if add_metadata is True:
375+
cols_to_include = ["value", "ID"] + list(metadata.keys())
376+
else:
377+
if isinstance(metadata_cols, str):
378+
metadata_cols = [metadata_cols]
379+
cols_to_include = ["value", "ID"] + metadata_cols
380+
381+
df_subset = df[cols_to_include]
382+
383+
ts_list = TimeSeries.from_group_dataframe(
384+
df_subset, group_cols="ID", metadata_cols=metadata_cols
385+
)
386+
reconstructed = to_group_dataframe(
387+
ts_list, add_static_covariates=True, add_metadata=add_metadata
388+
)
389+
390+
expected = df_subset.sort_values("ID")
391+
reconstructed = reconstructed.sort_values("ID")
392+
assert reconstructed.equals(expected)
393+
394+
@pytest.mark.parametrize(
395+
"add_static_cov,expected_cols",
396+
[
397+
(True, ["split", "set"]),
398+
(["split", "set", "ID"], ["split", "set"]),
399+
(["set", "ID"], "set"),
400+
],
401+
)
402+
def test_to_group_dataframe_global_static_cov_support(
403+
self, add_static_cov, expected_cols
404+
):
405+
df = pd.DataFrame({
406+
"value": [float(i) for i in range(10)] * 3,
407+
"ID": [0.0] * 10 + [1.0] * 10 + [2.0] * 10,
408+
"split": ["test"] * 10 + ["train"] * 20,
409+
"set": ["B"] * 20 + ["A"] * 10,
410+
})
411+
expected_cols = (
412+
[expected_cols] if isinstance(expected_cols, str) else expected_cols
413+
)
414+
df_subset = df[["value", "ID", *expected_cols]]
415+
ts_list = TimeSeries.from_group_dataframe(
416+
df_subset, group_cols="ID", static_cols=expected_cols
417+
)
418+
reconstructed = to_group_dataframe(
419+
ts_list, add_static_covariates=add_static_cov, add_metadata=True
420+
)
421+
expected = df_subset.sort_values(["ID"])
422+
reconstructed = reconstructed.sort_values(["ID"])
423+
reconstructed = reconstructed[expected.columns]
424+
assert reconstructed.equals(expected)
425+
426+
def test_to_group_dataframe_component_static_cov_support(self):
427+
df = pd.DataFrame({
428+
"time": pd.date_range("2023-01-01", periods=3, freq="D").tolist() * 2,
429+
"value1": [1.0, 2.0, 3.0] * 2,
430+
"value2": [4.0, 5.0, 6.0] * 2,
431+
"ID": [0.0] * 3 + [1.0] * 3,
432+
})
433+
static_covs = pd.DataFrame(
434+
{
435+
"source": ["sensor_A", "sensor_B"],
436+
"region": ["EU", "US"],
437+
},
438+
index=[0, 1],
439+
)
440+
441+
ts_list = [
442+
ts.with_static_covariates(static_covs)
443+
for ts in TimeSeries.from_group_dataframe(
444+
df,
445+
group_cols="ID",
446+
value_cols=["value1", "value2"],
447+
time_col="time",
448+
)
449+
]
450+
451+
reconstructed = to_group_dataframe(
452+
ts_list,
453+
add_static_covariates=True,
454+
add_metadata=False,
455+
time_as_index=False,
456+
)
457+
expected_cols = [
458+
"source_value1",
459+
"source_value2",
460+
"region_value1",
461+
"region_value2",
462+
]
463+
assert all(col in reconstructed.columns for col in expected_cols)
464+
465+
assert (reconstructed["source_value1"] == static_covs["source"].loc[0]).all()
466+
assert (reconstructed["source_value2"] == static_covs["source"].loc[1]).all()
467+
assert (reconstructed["region_value1"] == static_covs["region"].loc[0]).all()
468+
assert (reconstructed["region_value2"] == static_covs["region"].loc[1]).all()
469+
470+
@pytest.mark.parametrize("add_group_col", [True, "added_group_col"])
471+
def test_to_group_dataframe_add_group_col(self, add_group_col):
472+
df = pd.DataFrame({
473+
"time": pd.date_range("2023-01-01", periods=10, freq="D"),
474+
"value": np.arange(10).astype("float"),
475+
"ID": [0] * 5 + [1] * 5,
476+
})
477+
478+
ts = TimeSeries.from_group_dataframe(
479+
df,
480+
group_cols="ID",
481+
value_cols="value",
482+
time_col="time",
483+
)
484+
485+
reconstructed = to_group_dataframe(
486+
ts,
487+
add_static_covariates=False,
488+
add_metadata=False,
489+
add_group_col=add_group_col,
490+
time_as_index=False,
491+
)
492+
if not isinstance(add_group_col, str):
493+
add_group_col = "group"
494+
495+
assert "ID" not in reconstructed.columns
496+
reconstructed = reconstructed.rename(columns={add_group_col: "ID"})[df.columns]
497+
assert reconstructed.equals(df)
498+
183499
@pytest.mark.parametrize("backend", TEST_BACKENDS)
184500
def test_timeseries_from_longitudinal_df(self, backend):
185501
# univariate static covs: only group by "st1", keep static covs "st1"

0 commit comments

Comments
 (0)