Skip to content

Commit c2252a6

Browse files
authored
Fix StreamingDataset len after drop_last update (#778)
1 parent c864e14 commit c2252a6

File tree

2 files changed: +53 additions, -2 deletions

src/litdata/streaming/dataset.py

Lines changed: 24 additions & 2 deletions

@@ -223,10 +223,32 @@ def on_demand_bytes(self, value: bool) -> None:
         self.cache._reader.on_demand_bytes = value

     def set_shuffle(self, shuffle: bool) -> None:
-        self.shuffle = shuffle
+        """Set the shuffle parameter.
+
+        Invalidates the shuffler cache when the parameter changes to ensure
+        subsequent length calculations reflect the new shuffle setting.
+
+        Args:
+            shuffle: Whether to shuffle the dataset.
+
+        """
+        if self.shuffle != shuffle:
+            self.shuffle = shuffle
+            self.shuffler = None  # Reset shuffler to pick up new shuffle setting

     def set_drop_last(self, drop_last: bool) -> None:
-        self.drop_last = drop_last
+        """Set the drop_last parameter.
+
+        Invalidates the shuffler cache when the parameter changes to ensure
+        subsequent length calculations reflect the new drop_last setting.
+
+        Args:
+            drop_last: Whether to drop the last incomplete batch.
+
+        """
+        if self.drop_last != drop_last:
+            self.drop_last = drop_last
+            self.shuffler = None  # Reset shuffler to pick up new drop_last setting

     def set_epoch(self, current_epoch: int) -> None:
         """Set the current epoch to the dataset on epoch starts.

tests/streaming/test_dataset.py

Lines changed: 29 additions & 0 deletions

@@ -544,6 +544,35 @@ def test_dataset_cache_recreation(tmpdir):
     assert dataset.shuffler is shuffler  # shuffler gets reused


+@pytest.mark.timeout(30)
+def test_len_called_before_dataloader_drop_last(tmpdir):
+    cache = Cache(str(tmpdir), chunk_size=10)
+    for i in range(100):
+        cache[i] = i
+    cache.done()
+    cache.merge()
+
+    dataset = StreamingDataset(input_dir=str(tmpdir), shuffle=False)
+    _ = len(dataset)
+
+    batch_size = 8
+    dataloader = StreamingDataLoader(
+        dataset,
+        batch_size=batch_size,
+        num_workers=4,
+        drop_last=True,
+        shuffle=False,
+    )
+
+    expected_batches = len(dataloader)
+    batches = list(dataloader)
+
+    # With drop_last=True and 100 items: 100 // 8 = 12 full batches (4 items dropped)
+    assert expected_batches == 12
+    assert len(batches) == expected_batches
+    assert all(len(batch) == batch_size for batch in batches)
+
 def test_dataset_for_text_tokens(tmpdir):
     seed_everything(42)

Comments (0)