
Commit fa2020e

feat(streaming): enable per-dataset batch-sizes in CombinedStreamingDataset (Lightning-AI#635)
* feat(streaming): per-dataset batch-size support in CombinedStreamingDataset (Lightning-AI#327)
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* feat(streaming): add per-dataset batch-size support and fix mypy issues
* fix(streaming): always switch dataset once per-stream quota is met
* chore(typing): align batch_size annotation with Union[int, Sequence[int]]
* fix(typing): ensure int batch_size passed to get_len for mypy
* chore(typing): remove redundant casts flagged by mypy
* style(ruff): replace typing.List/Dict with built-in generics
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Parent: fc59c8a

4 files changed: +147 additions, -17 deletions
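For orientation before the diffs, here is a minimal usage sketch of what this commit enables. The dataset locations, weights, and the batching_method argument are illustrative assumptions rather than values taken from this commit; the new behaviour is set_batch_size accepting either a single int or one batch size per wrapped dataset.

from litdata import CombinedStreamingDataset, StreamingDataset

# Hypothetical dataset locations; replace with real optimized datasets.
dataset_a = StreamingDataset(input_dir="s3://my-bucket/dataset-a")
dataset_b = StreamingDataset(input_dir="s3://my-bucket/dataset-b")

combined = CombinedStreamingDataset(
    datasets=[dataset_a, dataset_b],
    weights=[0.5, 0.5],
    batching_method="per_stream",  # stay on one stream until its quota is met
)

# New in this commit: a sequence gives each wrapped dataset its own batch size;
# a plain int keeps the previous behaviour of one value for all datasets.
combined.set_batch_size([2, 6])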

src/litdata/streaming/combined.py

Lines changed: 39 additions & 7 deletions
@@ -15,7 +15,7 @@
 import random
 from collections.abc import Iterator, Sequence
 from copy import deepcopy
-from typing import Any, Literal, Optional
+from typing import Any, Literal, Optional, Union

 from litdata.debugger import ChromeTraceColors, _get_log_msg
 from litdata.streaming.dataset import StreamingDataset

@@ -170,7 +170,7 @@ def __init__(
         weights: Sequence[Optional[float]],
         use_streaming_dataloader: bool,
         num_samples_yielded: Any,
-        batch_size: int,
+        batch_size: Union[int, Sequence[int]],
         batching_method: BatchingMethodType,
         iterate_over_all: bool = False,
     ) -> None:

@@ -183,7 +183,14 @@ def __init__(
         self._rng = random.Random(seed)  # noqa: S311
         self._iterate_over_all = iterate_over_all
         self._batching_method = batching_method
+        # Batch size can be an int (applied to all datasets) or a sequence providing
+        # a specific batch size per dataset.
         self._batch_size = batch_size
+        from collections.abc import Sequence as _Sequence
+
+        # Validate when a sequence is provided
+        if isinstance(batch_size, _Sequence) and len(batch_size) != len(datasets):
+            raise ValueError("When providing a sequence of batch sizes, its length must match the number of datasets.")
         self._is_done = False

         if num_samples_yielded is not None:

@@ -196,9 +203,10 @@ def __init__(
         self._use_streaming_dataloader = use_streaming_dataloader
         self._is_done = False

-        # Used to track the number of samples yielded in the current batch
-        # and the current dataset index
-        # This is used only when batching_method is set to "per_stream"
+        # Track the number of samples yielded in the current (DataLoader) batch
+        # and the active dataset index. This is used only when batching_method is
+        # set to "per_stream". With per-dataset batch sizes we look up the limit
+        # dynamically based on ``self._batch_size`` if it is a sequence.
         self._samples_yielded_in_batch = 0
         self._cur_dataset_index = -1

@@ -240,11 +248,35 @@ def _get_dataset_index(self) -> int:
             # For every sample, randomly select a dataset (weighted)
             dataset_idx = self._set_new_dataset_index()
         elif self._batching_method == BatchingMethod.PER_STREAM:
-            # For each batch, pick a dataset and stick with it for the whole batch
-            if self._cur_dataset_index == -1 or self._samples_yielded_in_batch >= self._batch_size:
+            # For each batch, pick a dataset and stick with it until the
+            # desired number of samples for that dataset have been yielded.
+
+            from collections.abc import Sequence as _Sequence
+
+            if self._cur_dataset_index == -1:
+                # Start of iteration or after switching dataset
                 self._cur_dataset_index = self._set_new_dataset_index()
                 self._samples_yielded_in_batch = 0
+
             dataset_idx = self._cur_dataset_index
+
+            # Determine the batch-size limit for the current dataset
+            limit = self._batch_size[dataset_idx] if isinstance(self._batch_size, _Sequence) else self._batch_size
+
+            if self._samples_yielded_in_batch >= limit:
+                # Current dataset reached its quota; pick a *different* dataset if possible
+                candidate_idx = self._cur_dataset_index
+                if len([i for i in self._dataset_indexes if i is not None]) > 1:
+                    while candidate_idx == self._cur_dataset_index:
+                        candidate_idx = self._set_new_dataset_index()
+                # Update tracking
+                self._cur_dataset_index = candidate_idx
+                self._samples_yielded_in_batch = 0
+                dataset_idx = self._cur_dataset_index
+                # Re-compute limit for the new dataset
+                if isinstance(self._batch_size, _Sequence):
+                    limit = self._batch_size[dataset_idx]
+
             self._samples_yielded_in_batch += 1
         else:
             raise ValueError(f"Invalid batching method: {self._batching_method}")
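To make the new per_stream control flow easier to follow outside the diff, the sketch below re-implements the same quota logic in isolation, with a uniform random pick standing in for the weighted _set_new_dataset_index. It is a toy illustration under those assumptions, not library code.

import random


def simulate_per_stream(batch_sizes: list[int], num_samples: int, seed: int = 0) -> list[int]:
    """Return dataset indexes, staying on one dataset until its own limit is reached."""
    rng = random.Random(seed)
    cur = -1
    yielded_in_run = 0
    order: list[int] = []
    for _ in range(num_samples):
        if cur == -1:
            # Start of iteration: pick an initial dataset.
            cur = rng.randrange(len(batch_sizes))
            yielded_in_run = 0
        limit = batch_sizes[cur]
        if yielded_in_run >= limit:
            # Quota reached: switch to a different dataset when more than one exists.
            candidate = cur
            if len(batch_sizes) > 1:
                while candidate == cur:
                    candidate = rng.randrange(len(batch_sizes))
            cur = candidate
            yielded_in_run = 0
        order.append(cur)
        yielded_in_run += 1
    return order


# With batch_sizes=[2, 3], dataset 0 never appears more than 2 times in a row
# and dataset 1 never more than 3 times in a row.
print(simulate_per_stream([2, 3], 12))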

src/litdata/streaming/parallel.py

Lines changed: 7 additions & 1 deletion
@@ -250,7 +250,13 @@ def __iter__(self) -> Iterator[Any]:
         return self._iterator

     def __len__(self) -> Optional[int]:
-        return self.get_len(self.num_workers, self.batch_size if self.batch_size else 1)
+        # ``batch_size`` may be a sequence when per-dataset values were set on
+        # the wrapper. For length estimation we only need a scalar; we take
+        # the first element if a sequence is provided.
+        from collections.abc import Sequence
+
+        bs_int: int = int(self.batch_size[0]) if isinstance(self.batch_size, Sequence) else int(self.batch_size)
+        return self.get_len(self.num_workers, bs_int if bs_int else 1)

     def get_num_samples_yielded(
         self,
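The scalar fallback that __len__ now uses can be summarised in isolation; this is a standalone sketch mirroring the diff above, not a copy of the library code.

from collections.abc import Sequence
from typing import Union


def scalar_batch_size(batch_size: Union[int, Sequence[int]]) -> int:
    """First element of a per-dataset sequence, otherwise the int itself; 1 when falsy."""
    bs = int(batch_size[0]) if isinstance(batch_size, Sequence) else int(batch_size)
    return bs if bs else 1


assert scalar_batch_size([2, 6]) == 2
assert scalar_batch_size(8) == 8
assert scalar_batch_size(0) == 1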

src/litdata/utilities/base.py

Lines changed: 41 additions & 9 deletions
@@ -12,8 +12,8 @@
 # limitations under the License.

 from abc import ABC, abstractmethod
-from collections.abc import Iterator
-from typing import Any, Optional
+from collections.abc import Iterator, Sequence
+from typing import Any, Optional, Union

 from torch.utils.data import IterableDataset

@@ -30,7 +30,7 @@ class _BaseStreamingDatasetWrapper(IterableDataset, ABC):

     _datasets: list[StreamingDataset]
     _current_epoch: int
-    batch_size: int
+    batch_size: Union[int, Sequence[int]]
     num_workers: int
     _force_override_state_dict: bool
     _use_streaming_dataloader: bool

@@ -41,11 +41,31 @@ def set_shuffle(self, shuffle: bool) -> None:
         for dataset in self._datasets:
             dataset.set_shuffle(shuffle)

-    def set_batch_size(self, batch_size: int) -> None:
-        """Set the current batch size to the datasets."""
-        self.batch_size = batch_size
-        for dataset in self._datasets:
-            dataset.set_batch_size(batch_size)
+    def set_batch_size(self, batch_size: Union[int, Sequence[int]]) -> None:
+        """Set the current batch size.
+
+        This method now supports either:
+
+        1. a single ``int`` applied to all wrapped datasets (previous behaviour), or
+        2. a ``Sequence[int]`` that specifies one batch size per wrapped dataset.
+
+        The length of the sequence must match the number of wrapped datasets.
+        """
+        # Defer the import to avoid overhead when not required
+        from collections.abc import Sequence
+
+        self.batch_size = batch_size  # store as-is for later access
+
+        if isinstance(batch_size, Sequence):
+            if len(batch_size) != len(self._datasets):
+                raise ValueError(
+                    "The length of `batch_size` must match the number of datasets when passing a sequence."
+                )
+            for bs, dataset in zip(batch_size, self._datasets):
+                dataset.set_batch_size(bs)
+        else:
+            for dataset in self._datasets:
+                dataset.set_batch_size(int(batch_size))

     def set_num_workers(self, num_workers: int) -> None:
         """Set the current number of workers to the datasets."""

@@ -97,8 +117,20 @@ def load_state_dict(self, state_dict: dict[str, Any]) -> None:
             self._num_samples_yielded = state_dict["num_samples_yielded"]

     def _get_len(self, d: Any) -> int:
+        # mypy: ``self.batch_size`` can be a ``Sequence[int]`` now, but the
+        # underlying datasets still expect a plain ``int`` for their
+        # ``get_len`` signature. We pass an ``int`` in both cases and use the
+        # first element of the sequence when a per-dataset list is provided.
+
+        from collections.abc import Sequence
+
+        if isinstance(self.batch_size, Sequence):
+            bs_int: int = int(self.batch_size[0] if self.batch_size else 1)
+        else:
+            bs_int = int(self.batch_size)
+
         if isinstance(d, StreamingDataset):
-            return d.get_len(self.num_workers, self.batch_size)
+            return d.get_len(self.num_workers, bs_int)
         return len(d)

     @abstractmethod
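As a quick behavioural illustration of the new set_batch_size dispatch, the helper below (written only for this explanation, not part of the library) shows what each wrapped dataset would receive and when the validation error fires.

from collections.abc import Sequence
from typing import Union


def fan_out_batch_size(batch_size: Union[int, Sequence[int]], num_datasets: int) -> list[int]:
    """Return the per-dataset batch sizes the wrapper would forward to each dataset."""
    if isinstance(batch_size, Sequence):
        if len(batch_size) != num_datasets:
            raise ValueError("The length of `batch_size` must match the number of datasets.")
        return [int(bs) for bs in batch_size]
    return [int(batch_size)] * num_datasets


print(fan_out_batch_size(4, 2))       # [4, 4]: previous behaviour, one int for all
print(fan_out_batch_size([2, 6], 2))  # [2, 6]: new per-dataset behaviour
# fan_out_batch_size([2], 3) raises ValueError because the lengths do not match.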

tests/streaming/test_combined.py

Lines changed: 60 additions & 0 deletions
@@ -596,3 +596,63 @@ def test_combined_dataset_dataloader_states_partial_iterations(combined_dataset,
         assert dataloader.current_epoch == 2, "Current epoch should be 2 in the second iteration"
         samples_yielded += len(batch)
     assert samples_yielded == len(combined_dataset), "All samples should be yielded in the second epoch."
+
+
+# -----------------------------------------------------------------------------
+# New tests: per-dataset batch sizes with batching_method="per_stream"
+# -----------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("batch_sizes", [[1, 2], [2, 3]])
+def test_combined_dataset_per_dataset_batch_size(batch_sizes):
+    """Validate that when individual batch sizes are provided for each inner dataset.
+
+    The iterator respects these limits when *batching_method='per_stream'*.
+    """
+    # Build two trivial iterable datasets that produce easily distinguishable values
+    dataset1 = SimpleDataset(0, 200)  # dataset 0 values 0-199
+    dataset2 = SimpleDataset(1000, 1200)  # dataset 1 values 1000-1199
+
+    cds = TestCombinedStreamingDataset(
+        datasets=[dataset1, dataset2],
+        weights=[0.5, 0.5],
+        batching_method="per_stream",
+        iterate_over_all=False,
+        seed=123,
+    )
+
+    # Apply the per-dataset batch sizes
+    cds.set_batch_size(batch_sizes)
+
+    # Iterate a reasonable number of samples to observe several switches
+    num_samples = 300
+    iterator = iter(cds)
+
+    # Helper to map value -> dataset index
+    def get_ds_id(val):
+        return 0 if val < 1000 else 1
+
+    current_ds = None
+    run_length = 0
+
+    for _ in range(num_samples):
+        val = next(iterator)
+        ds_id = get_ds_id(val)
+
+        if current_ds is None:
+            # first sample
+            current_ds = ds_id
+            run_length = 1
+        elif ds_id == current_ds:
+            run_length += 1
+        else:
+            # dataset switch: verify previous run respected its quota
+            assert run_length <= batch_sizes[current_ds], (
+                f"Dataset {current_ds} emitted {run_length} consecutive samples (limit {batch_sizes[current_ds]})"
+            )
+            current_ds = ds_id
+            run_length = 1
+
+    # Final run check at loop end
+    if current_ds is not None:
+        assert run_length <= batch_sizes[current_ds]
