Feat/add to group dataframe() function (#3000)

Jaco-Pastorius · jakubchlapek · dennisbader · web-flow · commit d3a57dd0d19c · 2026-02-07T13:43:47.000+01:00
* Initial commit

* Datasets deleted

* test modif

* Set default add_static_cov to False

* Updated CHANGELOG.md and added docstring in to_group_dataframe()

* modified CHANGELOG.md

* reverted changes in example 15

* Feedback implementation, code coverage fixed, add of add_group_col argument

* error clarification

* docstring clarification

* Added condition for custom group column name

* minor updates

---------

Co-authored-by: Jakub Chłapek &lt;147340544+jakubchlapek@users.noreply.github.com&gt;
Co-authored-by: dennisbader &lt;dennis.bader@gmx.ch&gt;
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -28,6 +28,8 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
   - Includes automatic downsampling for large series (configurable via `downsample_threshold` parameter) to avoid crashes when plotting large series
   - Integrates seamlessly with `plotting.use_darts_style` which now affects both `TimeSeries.plot()` and `TimeSeries.plotly()`
   - Plotly remains an optional dependency and can be installed with `pip install plotly`
+- `TimeSeries.to_dataframe()` now supports fine-grained control over adding the time series' metadata and static covariates to the output DataFrame with the new parameters `add_static_cov` and `add_metadata`. [#2965](https://github.com/unit8co/darts/issues/2965) by [Gabriel Margaria](https://github.com/Jaco-Pastorius).
+- Added method `TimeSeries.to_group_dataframe()` to convert a list of time series into a long DataFrame (pandas, polars, ...) - e.g. into the DataFrame format that is used as input for `from_group_dataframe()`. [#2965](https://github.com/unit8co/darts/issues/2965) by [Gabriel Margaria](https://github.com/Jaco-Pastorius).
 
 **Fixed**
 
@@ -41,9 +43,9 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
 - We set an upper version cap on `pandas<3.0.0` until we officially support it. [#2995](https://github.com/unit8co/darts/pull/2995) by [Dennis Bader](https://github.com/dennisbader).
 
 ### For developers of the library:
-- Removed unit tests from the package distribution (PyPI and conda-forge) to reduce package size. [#2979](https://github.com/unit8co/darts/pull/2979) and [u8darts-feedstock#59](https://github.com/conda-forge/u8darts-feedstock/pull/59) by [Zhihao Dai](https://github.com/daidahao).
 
 - Migrated the dependency management and tooling to [uv](https://docs.astral.sh/uv/). Use `uv sync --group dev-all` to setup your development environment; See the [contribution guide](https://github.com/unit8co/darts/blob/master/CONTRIBUTING.md) for further information. [#2993](https://github.com/unit8co/darts/pull/2993) by [Jules Authier](https://github.com/authierj).
+- Removed unit tests from the package distribution (PyPI and conda-forge) to reduce package size. [#2979](https://github.com/unit8co/darts/pull/2979) and [u8darts-feedstock#59](https://github.com/conda-forge/u8darts-feedstock/pull/59) by [Zhihao Dai](https://github.com/daidahao).
 
 ## [0.40.0](https://github.com/unit8co/darts/tree/0.40.0) (2025-12-23)
 
diff --git a/darts/__init__.py b/darts/__init__.py
@@ -12,14 +12,20 @@
     reset_option,
     set_option,
 )
-from darts.timeseries import TimeSeries, concatenate, slice_intersect
+from darts.timeseries import (
+    TimeSeries,
+    concatenate,
+    slice_intersect,
+    to_group_dataframe,
+)
 
 __version__ = "0.40.0"
 
 __all__ = [
     "TimeSeries",
     "concatenate",
     "slice_intersect",
+    "to_group_dataframe",
     "get_option",
     "set_option",
     "reset_option",
diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py
@@ -8,7 +8,7 @@
 import pandas as pd
 import pytest
 
-from darts import TimeSeries, concatenate
+from darts import TimeSeries, concatenate, to_group_dataframe
 from darts.dataprocessing.transformers import BoxCox, Scaler
 from darts.tests.conftest import POLARS_AVAILABLE
 from darts.timeseries import (
@@ -135,6 +135,137 @@ def test_ts_from_x(self, tag, tmpdir_module):
         # Test without kwargs (new automatic serialization)
         self.helper_test_transfer(tag, ts, TimeSeries.from_json(ts_json))
 
+    @pytest.mark.parametrize("backend", TEST_BACKENDS)
+    def test_to_dataframe_add_static_covariates(self, backend):
+        """Tests adding global as well as component specific static covariates to dataframe."""
+        df = pd.DataFrame({
+            "time": [0, 1, 2],
+            "a": [1.0, 2.0, 3.0],
+            "b": [4.0, 5.0, 6.0],
+        })
+        df = self.pd_to_backend(df, backend)
+        static_covs = pd.DataFrame(
+            {"sc1": ["a"], "sc2": ["b"]}, index=["global_components"]
+        )
+        series = TimeSeries.from_dataframe(
+            df,
+            time_col="time",
+            static_covariates=static_covs,
+        )
+        assert series.static_covariates.equals(static_covs)
+
+        # sanity check that by default no static covs are added
+        kwargs = {"backend": backend, "time_as_index": False}
+        assert series.to_dataframe(**kwargs).equals(df)
+
+        # adding all static covariates
+        df_out = nw.from_native(
+            series.to_dataframe(**kwargs, add_static_covariates=True)
+        ).to_pandas()
+        assert df_out.columns.tolist() == ["time", "a", "b", "sc1", "sc2"]
+        assert (df_out[["sc1", "sc2"]] == static_covs.values).all().all()
+
+        # adding a single column as string
+        df_out = nw.from_native(
+            series.to_dataframe(**kwargs, add_static_covariates="sc2")
+        ).to_pandas()
+        assert df_out.columns.tolist() == ["time", "a", "b", "sc2"]
+        assert (df_out[["sc2"]] == static_covs[["sc2"]].values).all().all()
+
+        # adding a list of columns with different order
+        df_out = nw.from_native(
+            series.to_dataframe(**kwargs, add_static_covariates=["sc2", "sc1"])
+        ).to_pandas()
+        assert df_out.columns.tolist() == ["time", "a", "b", "sc2", "sc1"]
+        assert (
+            (df_out[["sc2", "sc1"]] == static_covs[["sc2", "sc1"]].values).all().all()
+        )
+
+        with pytest.raises(ValueError, match="`add_static_covariates` do not exist"):
+            _ = series.to_dataframe(
+                **kwargs, add_static_covariates=["does_not_exist", "sc1"]
+            )
+
+        # component specific static covariates
+        static_covs = pd.DataFrame(
+            {"sc1": ["aa", "ab"], "sc2": ["ba", "bb"]}, index=["a", "b"]
+        )
+        series = TimeSeries.from_dataframe(
+            df,
+            time_col="time",
+            static_covariates=static_covs,
+        )
+        assert series.static_covariates.equals(static_covs)
+        df_out = nw.from_native(
+            series.to_dataframe(**kwargs, add_static_covariates=True)
+        ).to_pandas()
+        assert df_out.columns.tolist() == [
+            "time",
+            "a",
+            "b",
+            "sc1_a",
+            "sc1_b",
+            "sc2_a",
+            "sc2_b",
+        ]
+        assert (
+            (
+                df_out[["sc1_a", "sc1_b", "sc2_a", "sc2_b"]]
+                == static_covs.values.flatten(order="K")
+            )
+            .all()
+            .all()
+        )
+
+    @pytest.mark.parametrize("backend", TEST_BACKENDS)
+    def test_to_dataframe_add_metadata(self, backend):
+        """Tests adding metadata to dataframe."""
+        df = pd.DataFrame({
+            "time": [0, 1, 2],
+            "a": [1.0, 2.0, 3.0],
+            "b": [4.0, 5.0, 6.0],
+        })
+        df = self.pd_to_backend(df, backend)
+        metadata = {"sc1": "a", "sc2": "b"}
+        series = TimeSeries.from_dataframe(
+            df,
+            time_col="time",
+            metadata=metadata,
+        )
+        assert series.metadata == metadata
+
+        # sanity check that by default no metadata are added
+        kwargs = {"backend": backend, "time_as_index": False}
+        assert series.to_dataframe(**kwargs).equals(df)
+
+        # adding all metadata
+        df_out = nw.from_native(
+            series.to_dataframe(**kwargs, add_metadata=True)
+        ).to_pandas()
+        assert df_out.columns.tolist() == ["time", "a", "b", "sc1", "sc2"]
+        assert (df_out[["sc1", "sc2"]] == metadata.values()).all().all()
+
+        # adding a single column as string
+        df_out = nw.from_native(
+            series.to_dataframe(**kwargs, add_metadata="sc2")
+        ).to_pandas()
+        assert df_out.columns.tolist() == ["time", "a", "b", "sc2"]
+        assert (df_out[["sc2"]] == metadata["sc2"]).all().all()
+
+        # adding a list of columns with different order
+        df_out = nw.from_native(
+            series.to_dataframe(**kwargs, add_metadata=["sc2", "sc1"])
+        ).to_pandas()
+        assert df_out.columns.tolist() == ["time", "a", "b", "sc2", "sc1"]
+        assert (
+            (df_out[["sc2", "sc1"]] == [metadata[key] for key in ["sc2", "sc1"]])
+            .all()
+            .all()
+        )
+
+        with pytest.raises(ValueError, match="`add_metadata` do not exist"):
+            _ = series.to_dataframe(**kwargs, add_metadata=["does_not_exist", "sc1"])
+
     def test_invalid_metadata(self):
         ts = linear_timeseries(length=10)
         with pytest.raises(ValueError) as exc:
@@ -180,6 +311,191 @@ def test_from_group_dataframe(self, config):
         assert (ts[0].values().flatten() == [values[2], values[1], values[0]]).all()
         assert (ts[1].values().flatten() == [values[3], values[4], values[5]]).all()
 
+    @pytest.mark.parametrize(
+        "backend,time_as_index",
+        itertools.product(
+            TEST_BACKENDS,
+            [True, False],
+        ),
+    )
+    def test_to_group_dataframe_creation(self, backend, time_as_index):
+        df_pd = pd.DataFrame(
+            data={
+                "time": pd.date_range(start="2023-01-01", periods=10, freq="D"),
+                "value": [float(i) for i in range(10)],
+                "ID": [0.0] * 5 + [1.0] * 5,
+            }
+        )
+        df = self.pd_to_backend(df_pd, backend)
+        series = TimeSeries.from_group_dataframe(
+            df,
+            time_col="time",
+            group_cols="ID",
+            value_cols="value",
+        )
+
+        if time_as_index and backend == "polars":
+            time_as_index = False
+        elif time_as_index:
+            df_pd = df_pd.set_index("time")
+
+        reconstructed = to_group_dataframe(
+            series,
+            add_static_covariates=True,
+            backend=backend,
+            time_as_index=time_as_index,
+        )
+        reconstructed_pd = nw.from_native(reconstructed).to_pandas()
+        expected = df_pd.sort_values(["ID", "time"])
+        reconstructed_pd = reconstructed_pd.sort_values(["ID", "time"])
+        assert reconstructed_pd.equals(expected)
+
+    @pytest.mark.parametrize(
+        "add_metadata,metadata_cols",
+        [
+            (True, ["source", "version", "created"]),
+            (["source", "version", "created"], ["source", "version", "created"]),
+            ("source", "source"),
+        ],
+    )
+    def test_to_group_dataframe_metadata_support(self, add_metadata, metadata_cols):
+        df = pd.DataFrame({
+            "value": [float(i) for i in range(10)] * 2,
+            "ID": [0.0] * 10 + [1.0] * 10,
+        })
+        metadata = {
+            "source": "test_data",
+            "version": "1.0",
+            "created": "2025-01-09",
+        }
+        for k, v in metadata.items():
+            df[k] = v
+
+        if add_metadata is True:
+            cols_to_include = ["value", "ID"] + list(metadata.keys())
+        else:
+            if isinstance(metadata_cols, str):
+                metadata_cols = [metadata_cols]
+            cols_to_include = ["value", "ID"] + metadata_cols
+
+        df_subset = df[cols_to_include]
+
+        ts_list = TimeSeries.from_group_dataframe(
+            df_subset, group_cols="ID", metadata_cols=metadata_cols
+        )
+        reconstructed = to_group_dataframe(
+            ts_list, add_static_covariates=True, add_metadata=add_metadata
+        )
+
+        expected = df_subset.sort_values("ID")
+        reconstructed = reconstructed.sort_values("ID")
+        assert reconstructed.equals(expected)
+
+    @pytest.mark.parametrize(
+        "add_static_cov,expected_cols",
+        [
+            (True, ["split", "set"]),
+            (["split", "set", "ID"], ["split", "set"]),
+            (["set", "ID"], "set"),
+        ],
+    )
+    def test_to_group_dataframe_global_static_cov_support(
+        self, add_static_cov, expected_cols
+    ):
+        df = pd.DataFrame({
+            "value": [float(i) for i in range(10)] * 3,
+            "ID": [0.0] * 10 + [1.0] * 10 + [2.0] * 10,
+            "split": ["test"] * 10 + ["train"] * 20,
+            "set": ["B"] * 20 + ["A"] * 10,
+        })
+        expected_cols = (
+            [expected_cols] if isinstance(expected_cols, str) else expected_cols
+        )
+        df_subset = df[["value", "ID", *expected_cols]]
+        ts_list = TimeSeries.from_group_dataframe(
+            df_subset, group_cols="ID", static_cols=expected_cols
+        )
+        reconstructed = to_group_dataframe(
+            ts_list, add_static_covariates=add_static_cov, add_metadata=True
+        )
+        expected = df_subset.sort_values(["ID"])
+        reconstructed = reconstructed.sort_values(["ID"])
+        reconstructed = reconstructed[expected.columns]
+        assert reconstructed.equals(expected)
+
+    def test_to_group_dataframe_component_static_cov_support(self):
+        df = pd.DataFrame({
+            "time": pd.date_range("2023-01-01", periods=3, freq="D").tolist() * 2,
+            "value1": [1.0, 2.0, 3.0] * 2,
+            "value2": [4.0, 5.0, 6.0] * 2,
+            "ID": [0.0] * 3 + [1.0] * 3,
+        })
+        static_covs = pd.DataFrame(
+            {
+                "source": ["sensor_A", "sensor_B"],
+                "region": ["EU", "US"],
+            },
+            index=[0, 1],
+        )
+
+        ts_list = [
+            ts.with_static_covariates(static_covs)
+            for ts in TimeSeries.from_group_dataframe(
+                df,
+                group_cols="ID",
+                value_cols=["value1", "value2"],
+                time_col="time",
+            )
+        ]
+
+        reconstructed = to_group_dataframe(
+            ts_list,
+            add_static_covariates=True,
+            add_metadata=False,
+            time_as_index=False,
+        )
+        expected_cols = [
+            "source_value1",
+            "source_value2",
+            "region_value1",
+            "region_value2",
+        ]
+        assert all(col in reconstructed.columns for col in expected_cols)
+
+        assert (reconstructed["source_value1"] == static_covs["source"].loc[0]).all()
+        assert (reconstructed["source_value2"] == static_covs["source"].loc[1]).all()
+        assert (reconstructed["region_value1"] == static_covs["region"].loc[0]).all()
+        assert (reconstructed["region_value2"] == static_covs["region"].loc[1]).all()
+
+    @pytest.mark.parametrize("add_group_col", [True, "added_group_col"])
+    def test_to_group_dataframe_add_group_col(self, add_group_col):
+        df = pd.DataFrame({
+            "time": pd.date_range("2023-01-01", periods=10, freq="D"),
+            "value": np.arange(10).astype("float"),
+            "ID": [0] * 5 + [1] * 5,
+        })
+
+        ts = TimeSeries.from_group_dataframe(
+            df,
+            group_cols="ID",
+            value_cols="value",
+            time_col="time",
+        )
+
+        reconstructed = to_group_dataframe(
+            ts,
+            add_static_covariates=False,
+            add_metadata=False,
+            add_group_col=add_group_col,
+            time_as_index=False,
+        )
+        if not isinstance(add_group_col, str):
+            add_group_col = "group"
+
+        assert "ID" not in reconstructed.columns
+        reconstructed = reconstructed.rename(columns={add_group_col: "ID"})[df.columns]
+        assert reconstructed.equals(df)
+
     @pytest.mark.parametrize("backend", TEST_BACKENDS)
     def test_timeseries_from_longitudinal_df(self, backend):
         # univariate static covs: only group by "st1", keep static covs "st1"
diff --git a/darts/timeseries.py b/darts/timeseries.py