Skip to content

Commit 807d22c

Browse files
brunnedu, jakubchlapek, and dennisbader
authored
Fix/max_samples_per_ts (#2987)
* Fix `max_samples_per_ts` not acting as an upper bound; add test; update changelog
* Update CHANGELOG.md

Co-authored-by: Jakub Chłapek <147340544+jakubchlapek@users.noreply.github.com>
Co-authored-by: Dustin Brunner <dustin.brunner@unit8.co>
Co-authored-by: Dennis Bader <dennis.bader@gmx.ch>
1 parent 72edd10 commit 807d22c

File tree

3 files changed

+82
-12
lines changed

3 files changed

+82
-12
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
1313

1414
**Fixed**
1515

16+
- Fixed a bug in `TorchTrainingDataset` where `max_samples_per_ts` was not acting as an upper bound on the number of samples per time series. Now `max_samples_per_ts` correctly acts as an upper bound, capping the dataset size at the actual number of samples that can be extracted from the longest series. [#2987](https://github.com/unit8co/darts/pull/2987) by [Dustin Brunner](https://github.com/brunnedu).
1617
- Updated s(m)ape to not raise a ValueError when actuals and predictions are zero for the same timestep. [#2984](https://github.com/unit8co/darts/pull/2984) by [eschibli](https://github.com/eschibli).
1718

1819
**Dependencies**

darts/tests/utils/torch_datasets/test_torch_datasets.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -881,6 +881,70 @@ def test_horizon_training_dataset_invalid_lh(self):
881881
"with `1 <= min_lh <= max_lh`."
882882
)
883883

884+
def test_max_samples_per_ts_upper_bound(self):
    """`max_samples_per_ts` must cap the dataset size at the number of samples
    actually extractable from the longest series, never inflate it.

    With input_chunk_length=11, output_chunk_length=13 and shift=24:
    size_of_both_chunks = max(11, 24 + 13) = 37, so a length-100 series
    yields 100 - 37 + 1 = 64 extractable samples.
    """
    target = self.cov1  # length 100 → 64 extractable samples

    def build(series, **overrides):
        # Shared constructor parameters; each case tweaks only what it tests.
        params = dict(
            series=series,
            input_chunk_length=11,
            output_chunk_length=13,
            shift=24,
        )
        params.update(overrides)
        return ShiftedTorchTrainingDataset(**params)

    # Case 1: no limit → all 64 samples are extracted
    assert len(build(target, max_samples_per_ts=None)) == 64

    # Case 2: limit far above the actual max (5000 >> 64) → capped at 64
    assert len(build(target, max_samples_per_ts=5000)) == 64

    # Case 3: limit below the actual max → the limit wins
    assert len(build(target, max_samples_per_ts=50)) == 50

    # Case 4: stride > 1 — extractable samples become ceil(64 / 2) = 32
    assert len(build(target, stride=2, max_samples_per_ts=100)) == 32

    # Case 5: multiple series of different lengths — the cap follows the
    # longest series, and every series contributes that many entries.
    short_series = gaussian_timeseries(length=50)  # 50 - 37 + 1 = 14 samples
    long_series = gaussian_timeseries(length=100)  # 100 - 37 + 1 = 64 samples
    ds_multi = build([short_series, long_series], max_samples_per_ts=5000)
    # Capped at 64 (the max over both series), so 2 * 64 = 128 in total.
    assert len(ds_multi) == 2 * 64
947+
884948
def test_past_covariates_sequential_dataset(self):
885949
# one target series
886950
ds = SequentialTorchTrainingDataset(

darts/utils/data/torch_datasets/training_dataset.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -170,19 +170,24 @@ def __init__(
170170

171171
size_of_both_chunks = max(input_chunk_length, shift + output_chunk_length)
172172

173-
# setup samples
174-
if max_samples_per_ts is None:
175-
# read all time series to get the maximum size
176-
max_samples_per_ts = max(len(ts) for ts in series) - size_of_both_chunks + 1
177-
if max_samples_per_ts <= 0:
178-
raise_log(
179-
ValueError(
180-
f"The input `series` are too short to extract even a single sample. "
181-
f"Expected min length: `{size_of_both_chunks}`, received max length: "
182-
f"`{max_samples_per_ts + size_of_both_chunks - 1}`."
183-
)
173+
# compute the maximum available samples over all series
174+
max_available_indices = max(len(ts) for ts in series) - size_of_both_chunks + 1
175+
max_available_samples = ceil(max_available_indices / stride)
176+
177+
if max_available_indices <= 0:
178+
raise_log(
179+
ValueError(
180+
f"The input `series` are too short to extract even a single sample. "
181+
f"Expected min length: `{size_of_both_chunks}`, received max length: "
182+
f"`{max(len(ts) for ts in series)}`."
184183
)
185-
max_samples_per_ts = ceil(max_samples_per_ts / stride)
184+
)
185+
186+
if max_samples_per_ts is None:
187+
max_samples_per_ts = max_available_samples
188+
else:
189+
# upper bound maximum available samples by max_samples_per_ts
190+
max_samples_per_ts = min(max_samples_per_ts, max_available_samples)
186191

187192
self.input_chunk_length = input_chunk_length
188193
self.output_chunk_length = output_chunk_length

0 commit comments

Comments
 (0)