Fix: redundant chunk index download request in BinaryReader when the dataset is in iter mode (#535)

bhimrazy · pre-commit-ci[bot] · web-flow · commit ee033833589c · 2025-03-30T02:24:32.000+05:45
* Remove redundant chunk index download request in BinaryReader
* update the condition
* Reset last chunk index and queued download state on close
* add test case for dataset as iterator and non iterator
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci
* fix typo
* update comment for clarity on chunk download conditions

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/src/litdata/streaming/reader.py b/src/litdata/streaming/reader.py
@@ -299,6 +299,7 @@ def __init__(
         self._prepare_thread: Optional[PrepareChunksThread] = None
         self._item_loader = item_loader or PyTreeLoader()
         self._last_chunk_index: Optional[int] = None
+        self._chunks_queued_for_download = False
         self._max_cache_size = int(os.getenv("MAX_CACHE_SIZE", max_cache_size or 0))
         self._storage_options = storage_options
         self._max_pre_download = max_pre_download
@@ -368,9 +369,12 @@ def read(self, index: ChunkedIndex) -> Any:
                 self._prepare_thread.start()
                 if index.chunk_indexes:
                     self._prepare_thread.download(index.chunk_indexes)
+                    self._chunks_queued_for_download = True
 
-            # If the chunk_index is new, request for it to be downloaded.
-            if index.chunk_index != self._last_chunk_index:
+            # Only request individual chunk download if:
+            # 1. We haven't already queued all chunks for the download
+            # 2. We're processing a new chunk (different from the last one)
+            if not self._chunks_queued_for_download and index.chunk_index != self._last_chunk_index:
                 assert self._prepare_thread
                 self._prepare_thread.download([index.chunk_index])
 
@@ -417,6 +421,8 @@ def read(self, index: ChunkedIndex) -> Any:
             self._prepare_thread.stop()
             self._prepare_thread = None
             self._item_loader.close(self._last_chunk_index)
+            self._last_chunk_index = None
+            self._chunks_queued_for_download = False
 
         return item
 
diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
@@ -1505,3 +1505,57 @@ def mock_read(self, index):
     assert len(indexes) == 1, "Expected exactly one index with is_last_index=True"
     assert indexes[0].is_last_index, "Expected is_last_index=True for the last item"
     assert indexes[0].chunk_index == worker_chunks[-1], "Expected to match the last chunk"
+
+
+@pytest.mark.parametrize("local", [True, False])
+@pytest.mark.parametrize("shuffle", [True, False])
+def test_dataset_as_iterator_and_non_iterator(tmpdir, local, shuffle):
+    """Test that _chunks_queued_for_download flag is correctly set and reset in reader.
+
+    This test verifies that:
+    1. When iterating, _chunks_queued_for_download is enabled during iteration but reset when done
+    2. When accessing by index, _chunks_queued_for_download is never enabled
+    """
+    # Create separate directories: data_dir receives the written chunks, cache_dir is handed to the dataset
+    cache_dir = os.path.join(tmpdir, "cache_dir")
+    data_dir = os.path.join(tmpdir, "data_dir")
+    os.makedirs(cache_dir)
+    os.makedirs(data_dir)
+
+    # Create a dataset with 50 items, 10 items per chunk
+    cache = Cache(str(data_dir), chunk_size=10)
+    for i in range(50):
+        cache[i] = i
+    cache.done()
+    cache.merge()
+
+    # With local=True, use the `local:` prefix and an explicit cache_dir; otherwise pass the plain directory path
+    input_dir = f"local:{data_dir}" if local else str(data_dir)
+    dataset = StreamingDataset(input_dir, cache_dir=str(cache_dir) if local else None, shuffle=shuffle)
+    dataset_length = len(dataset)
+    assert dataset_length == 50
+
+    # ACT & ASSERT - Test iterator mode
+    for i, data in enumerate(dataset):
+        assert data is not None
+        if local and i < dataset_length - 1:
+            # With the `local:` prefix, the flag should stay enabled for every item except the last one
+            assert (
+                dataset.cache._reader._chunks_queued_for_download is True
+            ), "_chunks_queued_for_download should be enabled during iteration"
+        else:
+            assert dataset.cache._reader._chunks_queued_for_download is False, (
+                "_chunks_queued_for_download should be disabled when used as local dir without `local:` prefix"
+                " or when iteration is done"
+            )
+    # After iteration, _chunks_queued_for_download should be reset
+    assert dataset.cache._reader._chunks_queued_for_download is False
+
+    # ACT & ASSERT - Test indexed access mode
+    for i in range(dataset_length):
+        data = dataset[i]
+        assert data is not None
+        # In indexed access mode, _chunks_queued_for_download should never be enabled
+        assert dataset.cache._reader._chunks_queued_for_download is False
+
+    assert dataset.cache._reader._chunks_queued_for_download is False