
Commit c989a97

tchaton and thomas authored
feat(fr) StreamingDataset: Fault Tolerance v2 1/n (#19196)
Co-authored-by: thomas <[email protected]>
1 parent 9e159e1 commit c989a97

File tree

7 files changed: +383 −13 lines


src/lightning/data/streaming/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -12,14 +12,18 @@
 # limitations under the License.

 from lightning.data.streaming.cache import Cache
+from lightning.data.streaming.combined import CombinedStreamingDataset
 from lightning.data.streaming.data_processor import DataChunkRecipe, DataProcessor, DataTransformRecipe
+from lightning.data.streaming.dataloader import StreamingDataLoader
 from lightning.data.streaming.dataset import StreamingDataset
 from lightning.data.streaming.item_loader import TokensLoader

 __all__ = [
     "Cache",
     "DataProcessor",
     "StreamingDataset",
+    "CombinedStreamingDataset",
+    "StreamingDataLoader",
     "DataTransformRecipe",
     "DataChunkRecipe",
     "TokensLoader",

src/lightning/data/streaming/combined.py

Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
# Copyright The Lightning AI team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
from typing import Any, Dict, Iterator, List, Optional, Sequence

from torch.utils.data import IterableDataset

from lightning.data.streaming.dataset import StreamingDataset


class CombinedStreamingDataset(IterableDataset):
    """The `CombinedStreamingDataset` enables streaming data from multiple `StreamingDataset`s with the sampling
    ratio of your choice.

    Additionally, the `CombinedStreamingDataset` keeps track of the number of samples fetched to enable
    resumability of the datasets.

    """

    def __init__(
        self, datasets: List[StreamingDataset], seed: int = 42, weights: Optional[Sequence[float]] = None
    ) -> None:
        self._seed = seed
        self._datasets = datasets
        self._weights = weights
        num_datasets = len(datasets)

        if weights is None:
            # Inversely weighted based on length
            self._weights = [1 / float(num_datasets)] * num_datasets
        else:
            self._weights = [w / sum(weights) for w in weights]

        self._iterator: Optional[_CombinedDatasetIterator] = None

    def __iter__(self) -> Iterator[Any]:
        assert self._weights
        self._iterator = _CombinedDatasetIterator(self._datasets, self._seed, self._weights)
        return self._iterator

    def state_dict(self, num_workers: int, batch_size: int) -> Dict[str, Any]:
        if self._iterator is None:
            return {}
        return self._iterator.state_dict(num_workers, batch_size)

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        if len(state_dict) != len(self._datasets):
            raise RuntimeError(f"The provided state doesn't match the current number of datasets: {self._datasets}.")

        for dataset_idx, dataset in enumerate(self._datasets):
            if str(dataset_idx) not in state_dict:
                raise RuntimeError(f"The provided state doesn't contain the index {dataset_idx}.")

            dataset.load_state_dict(state_dict[str(dataset_idx)])


class _CombinedDatasetIterator(Iterator):
    def __init__(self, datasets: List[StreamingDataset], seed: int, weights: Sequence[float]) -> None:
        self._datasets = datasets
        self._dataset_iters = [iter(dataset) for dataset in datasets]
        self._dataset_indexes = list(range(len(datasets)))
        self._num_samples_yielded = [0 for _ in range(len(datasets))]
        self._weights = weights
        self._rng = random.Random(seed)

    def __next__(self) -> Any:
        # randomly select a dataset index
        (dataset_index,) = self._rng.choices(self._dataset_indexes, weights=self._weights, k=1)

        # keep track of the number of samples fetched from this dataset
        self._num_samples_yielded[dataset_index] += 1

        # return a new sample
        return next(self._dataset_iters[dataset_index])

    def state_dict(self, num_workers: int = 0, batch_size: int = 1) -> Dict[str, Any]:
        return {
            str(dataset_idx): dataset.state_dict(self._num_samples_yielded[dataset_idx], num_workers, batch_size)
            for dataset_idx, dataset in enumerate(self._datasets)
        }
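
A short usage sketch for `CombinedStreamingDataset`, assuming two already-prepared streaming datasets; the bucket URIs and the 0.7/0.3 weights below are illustrative placeholders, not part of this commit:

    from lightning.data.streaming import CombinedStreamingDataset, StreamingDataset

    # Hypothetical inputs: any two prepared StreamingDataset locations would work here.
    english = StreamingDataset("s3://my-bucket/english")
    french = StreamingDataset("s3://my-bucket/french")

    # Roughly 70% of samples come from the first dataset and 30% from the second.
    # Weights are normalized internally, so [7, 3] would behave the same way.
    combined = CombinedStreamingDataset([english, french], seed=42, weights=[0.7, 0.3])

    for sample in combined:
        ...  # each item is drawn from one of the two datasets at random

Once iteration has started, `combined.state_dict(num_workers, batch_size)` returns one entry per dataset, keyed by its index, which is exactly the structure `load_state_dict` validates above.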

src/lightning/data/streaming/dataloader.py

Lines changed: 60 additions & 6 deletions
@@ -16,7 +16,7 @@
 import logging
 import os
 from importlib import reload
-from typing import Any, Callable, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union

 import torch
 from torch.utils.data import Dataset, IterableDataset
@@ -32,7 +32,9 @@
 from torch.utils.data.sampler import BatchSampler, Sampler

 from lightning.data.streaming import Cache
+from lightning.data.streaming.combined import CombinedStreamingDataset
 from lightning.data.streaming.constants import _DEFAULT_CHUNK_BYTES, _TORCH_GREATER_EQUAL_2_1_0, _VIZ_TRACKER_AVAILABLE
+from lightning.data.streaming.dataset import StreamingDataset
 from lightning.data.streaming.sampler import CacheBatchSampler
 from lightning.data.utilities.env import _DistributedEnv

@@ -248,7 +250,7 @@ def _next_data(self) -> Any:
             raise e


-class StreamingDataLoader(DataLoader):
+class CacheDataLoader(DataLoader):
     __doc__ = DataLoader.__doc__

     def __init__(
@@ -271,16 +273,16 @@ def __init__(
     ) -> None:
         if sampler:
             raise ValueError(
-                "The StreamingDataLoader relies on its own internal sampler. Passing a sampler isn't supported."
+                "The CacheDataLoader relies on its own internal sampler. Passing a sampler isn't supported."
             )

         if batch_sampler:
             raise ValueError(
-                "The StreamingDataLoader relies on its own internal sampler. Passing a batch_sampler isn't supported."
+                "The CacheDataLoader relies on its own internal sampler. Passing a batch_sampler isn't supported."
             )

         if isinstance(dataset, IterableDataset):
-            raise ValueError("Only map-based dataset are supported by the StreamingDataLoader for now.")
+            raise ValueError("Only map-based dataset are supported by the CacheDataLoader for now.")

         if profile and not _VIZ_TRACKER_AVAILABLE:
             raise ModuleNotFoundError("To enable DataLoader profiling, run `pip install viztracer`.")
@@ -294,7 +296,7 @@ def __init__(

         if len(cache_list) == 0:
             if cache_dir is None:
-                raise ValueError("You should provide a `cache_dir` filepath to the StreamingDataLoader.")
+                raise ValueError("You should provide a `cache_dir` filepath to the CacheDataLoader.")

             dataset = CacheDataset(dataset, cache_dir, chunk_bytes, batch_size, compression)
             cache = dataset._cache
@@ -337,3 +339,55 @@ def _get_iterator(self) -> "_BaseDataLoaderIter":
             return _SingleProcessDataLoaderIterPatch(self)
         self.check_worker_number_rationality()
         return _MultiProcessingDataLoaderIterPatch(self)
+
+
+class StreamingDataLoader(DataLoader):
+    """The `StreamingDataLoader` keeps track of the number of samples fetched in order to enable resumability of
+    the dataset."""
+
+    __doc__ = DataLoader.__doc__
+
+    def __init__(
+        self,
+        dataset: Union[StreamingDataset, CombinedStreamingDataset],
+        *args: Any,
+        batch_size: int = 1,
+        num_workers: int = 0,
+        **kwargs: Any,
+    ) -> None:  # pyright: ignore
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.num_samples_yielded = 0
+        super().__init__(dataset, *args, batch_size=batch_size, num_workers=num_workers, **kwargs)  # type: ignore
+
+    def __iter__(self) -> Any:
+        if isinstance(self.dataset, StreamingDataset):
+            assert self.batch_size
+            self.num_samples_yielded = 0
+            for batch in super().__iter__():
+                self.num_samples_yielded += self.batch_size
+                yield batch
+        else:
+            yield from super().__iter__()
+
+    def state_dict(self) -> Optional[Dict[str, Any]]:
+        if isinstance(self.dataset, StreamingDataset):
+            assert self.batch_size
+            env = _DistributedEnv.detect()
+            num_samples = self.num_samples_yielded * env.world_size
+            return self.dataset.state_dict(num_samples, self.num_workers, self.batch_size)
+        return self.dataset.state_dict(self.num_workers, self.batch_size)
+
+    def load_state_dict(self, obj: Dict[str, Any]) -> None:
+        """Load a dict containing training state (called from non-worker process).
+
+        This is called on each copy of the dataset when resuming.
+
+        Args:
+            obj (Dict[str, Any]): The state.
+
+        """
+        if isinstance(self.dataset, (StreamingDataset, CombinedStreamingDataset)):
+            self.dataset.load_state_dict(obj)
+        else:
+            raise RuntimeError("The provided dataset should be a `StreamingDataset` or a `CombinedStreamingDataset`.")

src/lightning/data/streaming/dataset.py

Lines changed: 1 addition & 1 deletion
@@ -302,7 +302,7 @@ def _checkpoint(self, chunk_index: int) -> None:

         self.last_time = time()

-    def state_dict(self) -> Dict[str, Any]:
+    def state_dict(self, num_samples_yielded: int = 0, num_workers: int = 0, batch_size: int = 1) -> Dict[str, Any]:
         if _is_in_dataloader_worker():
             raise RuntimeError("The method `state_dict` should only be called in the main process.")
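
With the extended signature, callers such as `StreamingDataLoader.state_dict` above can forward the progress counters explicitly; a hedged sketch of a direct call, with placeholder values:

    # The new keyword parameters all default to a fresh state
    # (num_samples_yielded=0, num_workers=0, batch_size=1).
    state = dataset.state_dict(num_samples_yielded=800, num_workers=2, batch_size=8)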

tests/tests_data/streaming/test_cache.py

Lines changed: 6 additions & 6 deletions
@@ -20,7 +20,7 @@
 import torch
 from lightning import seed_everything
 from lightning.data.streaming import Cache
-from lightning.data.streaming.dataloader import StreamingDataLoader
+from lightning.data.streaming.dataloader import CacheDataLoader
 from lightning.data.streaming.dataset import StreamingDataset
 from lightning.data.streaming.item_loader import TokensLoader
 from lightning.data.streaming.serializers import Serializer
@@ -72,7 +72,7 @@ def _cache_for_image_dataset(num_workers, tmpdir, fabric=None):

     cache = Cache(cache_dir, chunk_size=10)
     dataset = ImageDataset(tmpdir, cache, dataset_size, 10)
-    dataloader = StreamingDataLoader(dataset, num_workers=num_workers, batch_size=4)
+    dataloader = CacheDataLoader(dataset, num_workers=num_workers, batch_size=4)

     for _ in dataloader:
         pass
@@ -92,15 +92,15 @@ def _cache_for_image_dataset(num_workers, tmpdir, fabric=None):

     if distributed_env.world_size == 1:
         indexes = []
-        dataloader = StreamingDataLoader(dataset, num_workers=num_workers, batch_size=4)
+        dataloader = CacheDataLoader(dataset, num_workers=num_workers, batch_size=4)
         for batch in dataloader:
             if batch:
                 indexes.extend(batch["index"].numpy().tolist())
         assert len(indexes) == dataset_size

     seed_everything(42)

-    dataloader = StreamingDataLoader(dataset, num_workers=num_workers, batch_size=4, shuffle=True)
+    dataloader = CacheDataLoader(dataset, num_workers=num_workers, batch_size=4, shuffle=True)
     dataloader_iter = iter(dataloader)

     indexes = []
@@ -194,7 +194,7 @@ def test_cache_with_auto_wrapping(tmpdir):
     os.makedirs(os.path.join(tmpdir, "cache_1"), exist_ok=True)

     dataset = RandomDataset(64, 64)
-    dataloader = StreamingDataLoader(dataset, cache_dir=os.path.join(tmpdir, "cache_1"), chunk_bytes=2 << 12)
+    dataloader = CacheDataLoader(dataset, cache_dir=os.path.join(tmpdir, "cache_1"), chunk_bytes=2 << 12)
     for batch in dataloader:
         assert isinstance(batch, torch.Tensor)
     assert sorted(os.listdir(os.path.join(tmpdir, "cache_1"))) == [
@@ -217,7 +217,7 @@ def __len__(self) -> int:

     os.makedirs(os.path.join(tmpdir, "cache_2"), exist_ok=True)
     dataset = RandomDatasetAtRuntime(64, 64)
-    dataloader = StreamingDataLoader(dataset, cache_dir=os.path.join(tmpdir, "cache_2"), chunk_bytes=2 << 12)
+    dataloader = CacheDataLoader(dataset, cache_dir=os.path.join(tmpdir, "cache_2"), chunk_bytes=2 << 12)
     with pytest.raises(ValueError, match="Your dataset items aren't deterministic"):
         for batch in dataloader:
             pass
