
Commit c148282

tchaton, awaelchli, thomas, and pre-commit-ci[bot] authored and committed
Prevent leaking the thread to the workers (#18891)
Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: thomas <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
(cherry picked from commit 2526c90)
1 parent b2a8ddd commit c148282

File tree

4 files changed: +51, −4 lines


src/lightning/data/streaming/downloader.py

Lines changed: 10 additions & 2 deletions
@@ -11,6 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import shutil
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Type
 from urllib import parse
@@ -63,8 +64,15 @@ def download_file(cls, remote_filepath: str, local_filepath: str) -> None:
         )


-# TODO: Add fsspec support
-_DOWNLOADERS = {"s3://": S3Downloader}
+class LocalDownloader(Downloader):
+    @classmethod
+    def download_file(cls, remote_filepath: str, local_filepath: str) -> None:
+        if not os.path.exists(remote_filepath):
+            raise FileNotFoundError(f"The provided remote_filepath doesn't exist: {remote_filepath}")
+        shutil.copy(remote_filepath, local_filepath)
+
+
+_DOWNLOADERS = {"s3://": S3Downloader, "": LocalDownloader}


 def get_downloader_cls(remote_dir: str) -> Type[Downloader]:
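For context, a quick usage sketch of the new local fallback (not part of the commit; the paths are made up, and the prefix matching inside get_downloader_cls is assumed from the "" entry in _DOWNLOADERS):

```python
from lightning.data.streaming.downloader import LocalDownloader, get_downloader_cls

# A remote_dir without an "s3://" prefix is assumed to fall through to the "" entry
# and resolve to LocalDownloader, which simply copies the file on the local filesystem.
downloader_cls = get_downloader_cls("/data/remote_dir")  # hypothetical local "remote" directory
assert downloader_cls is LocalDownloader

downloader_cls.download_file(
    "/data/remote_dir/chunk-0.bin",  # hypothetical source chunk
    "/tmp/cache/chunk-0.bin",        # hypothetical cache destination
)
```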

src/lightning/data/streaming/item_loader.py

Lines changed: 7 additions & 2 deletions
@@ -51,6 +51,9 @@ def load_item_from_chunk(self, index: int, chunk_index: int, chunk_filepath: str
 class PyTreeLoader(BaseItemLoader):
     """The Pytree Loader is the default loader of the Cache object."""

+    def __init__(self) -> None:
+        self._chunk_filepaths: Dict[str, bool] = {}
+
     def generate_intervals(self) -> List[Tuple[int, int]]:
         intervals = []
         begin = 0
@@ -64,8 +67,10 @@ def generate_intervals(self) -> List[Tuple[int, int]]:
     def load_item_from_chunk(self, index: int, chunk_index: int, chunk_filepath: str, begin: int) -> bytes:
         offset = (1 + (index - begin) if index >= begin else index + 1) * 4

-        while not os.path.exists(chunk_filepath):
-            sleep(0.0001)
+        if chunk_filepath not in self._chunk_filepaths:
+            while not os.path.exists(chunk_filepath):
+                sleep(0.001)
+            self._chunk_filepaths[chunk_filepath] = True

         with open(chunk_filepath, "rb", 0) as fp:
             fp.seek(offset)
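The change above memoizes which chunk files have already been seen, so the existence poll runs only on a chunk's first access instead of on every item. A standalone sketch of that pattern (illustrative names only, not Lightning API):

```python
import os
from time import sleep
from typing import Dict

_seen_chunk_files: Dict[str, bool] = {}


def wait_for_chunk(path: str) -> None:
    # Poll for the file only the first time this chunk path is requested;
    # later calls assume the download has already completed.
    if path not in _seen_chunk_files:
        while not os.path.exists(path):
            sleep(0.001)
        _seen_chunk_files[path] = True
```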

src/lightning/data/streaming/reader.py

Lines changed: 5 additions & 0 deletions
@@ -172,3 +172,8 @@ def get_chunk_intervals(self) -> List[Tuple[int, int]]:
             raise Exception("The reader index isn't defined.")

         return self.config.intervals
+
+    def __getstate__(self) -> Dict[str, Any]:
+        state = self.__dict__.copy()
+        state["_prepare_thread"] = None
+        return state
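The __getstate__ hook above is the heart of the fix: a live threading.Thread can't be pickled or deep-copied, so the reader drops it from its serialized state and every copy (for example, a DataLoader worker) lazily starts its own thread. A minimal sketch of the pattern, assuming only that the reader keeps a _prepare_thread attribute (_ReaderLike below is illustrative, not Lightning code):

```python
import copy
import threading
from typing import Any, Dict, Optional


class _ReaderLike:
    """Illustrative stand-in for the reader; only the thread handling is mirrored."""

    def __init__(self) -> None:
        self._prepare_thread: Optional[threading.Thread] = None

    def read(self) -> None:
        # Lazily start a background thread on first use, like the real reader.
        if self._prepare_thread is None:
            self._prepare_thread = threading.Thread(target=lambda: None, daemon=True)
            self._prepare_thread.start()

    def __getstate__(self) -> Dict[str, Any]:
        # Same trick as the commit: never serialize the live thread.
        state = self.__dict__.copy()
        state["_prepare_thread"] = None
        return state


reader = _ReaderLike()
reader.read()                        # the parent process now owns a live thread
worker_copy = copy.deepcopy(reader)  # without __getstate__, this would trip over the thread's lock
assert worker_copy._prepare_thread is None
```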

tests/tests_data/streaming/test_dataset.py

Lines changed: 29 additions & 0 deletions
@@ -206,3 +206,32 @@ def test_streaming_dataset_distributed_full_shuffle_even(drop_last, tmpdir):
     assert len(process_2_1) == 611

     assert len([i for i in process_1_1 if i in process_2_1]) == 0
+
+
+def test_streaming_dataset_deepcopy(tmpdir, monkeypatch):
+    seed_everything(42)
+
+    remote_dir = os.path.join(tmpdir, "remote_dir")
+
+    os.makedirs(remote_dir, exist_ok=True)
+
+    cache = Cache(remote_dir, chunk_size=10)
+    for i in range(10):
+        cache[i] = i
+
+    cache.done()
+    cache.merge()
+
+    monkeypatch.setattr(cache_module, "_find_remote_dir", lambda x, y: (str(remote_dir), True))
+
+    dataset = StreamingDataset(name="choco", cache_dir=tmpdir, shuffle=True)
+    assert dataset.cache._reader._prepare_thread is None
+    _ = dataset[0]
+    assert dataset.cache._reader._prepare_thread
+    dataloader = DataLoader(dataset, num_workers=1)
+
+    batches = []
+    for batch in dataloader:
+        batches.append(batch)
+
+    assert len(batches) == 10
