Commit cb06f09

tchaton, awaelchli, and thomas authored and committed
Improve Streaming Dataset API (#18882)
Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: thomas <[email protected]>
(cherry picked from commit c1437cc)
1 parent a55e3d3 commit cb06f09

File tree

4 files changed, +125 -175 lines changed

src/lightning/data/streaming/dataset.py

Lines changed: 13 additions & 15 deletions
@@ -20,7 +20,7 @@
 from lightning.data.streaming import Cache
 from lightning.data.streaming.item_loader import BaseItemLoader
 from lightning.data.streaming.sampler import ChunkedIndex
-from lightning.data.streaming.shuffle import FullShuffle, NoShuffle, Shuffle, TruncatedShuffle
+from lightning.data.streaming.shuffle import FullShuffle, NoShuffle, Shuffle
 
 
 class StreamingDataset(IterableDataset):
@@ -32,7 +32,8 @@ def __init__(
         version: Optional[Union[int, Literal["latest"]]] = "latest",
         cache_dir: Optional[str] = None,
         item_loader: Optional[BaseItemLoader] = None,
-        shuffle: Union[bool, Literal["truncated", "full"]] = "truncated",
+        shuffle: bool = False,
+        drop_last: bool = False,
         seed: int = 42,
     ) -> None:
         """The streaming dataset can be used once your data have been optimised using the DatasetOptimiser class.
@@ -43,10 +44,15 @@ def __init__(
             cache_dir: The cache dir where the data would be stored.
             item_loader: The logic to load an item from a chunk.
             shuffle: Whether to shuffle the data.
+            drop_last: If `True`, drops the last items to ensure that
+                all processes/workers return the same amount of data.
             seed: Random seed for shuffling.
 
         """
         super().__init__()
+        if not isinstance(shuffle, bool):
+            raise ValueError(f"Shuffle should be a boolean. Found {shuffle}")
+
         self.cache = Cache(name=name, version=version, cache_dir=cache_dir, item_loader=item_loader, chunk_bytes=1)
 
         self.cache._reader._try_load_config()
@@ -56,18 +62,10 @@ def __init__(
 
         self.distributed_env = _DistributedEnv.detect()
 
-        if isinstance(shuffle, bool):
-            _shuffle = TruncatedShuffle(self.cache, seed) if shuffle else NoShuffle(self.cache, seed)
-
-        if isinstance(shuffle, str):
-            if shuffle == "truncated":
-                _shuffle = TruncatedShuffle(self.cache, seed)
-            elif shuffle == "full":
-                _shuffle = FullShuffle(self.cache, seed)
-            else:
-                raise ValueError(f"The provided shuffle doesn't exist. Found {shuffle}")
-
-        self.shuffle: Shuffle = _shuffle
+        self.shuffle: Shuffle = (
+            FullShuffle(self.cache, seed, drop_last) if shuffle else NoShuffle(self.cache, seed, drop_last)
+        )
+        self.drop_last = drop_last
         self.worker_env: Optional[_WorkerEnv] = None
         self.worker_chunks: List[int] = []
         self.worker_intervals: List[List[int]] = []
@@ -84,7 +82,7 @@ def __len__(self) -> int:
         return self.shuffle.get_len(self.distributed_env, self.current_epoch)
 
     def __iter__(self) -> "StreamingDataset":
-        chunks_per_replica, intervals_per_replica = self.shuffle.get_chunks_and_intervals_per_process(
+        chunks_per_replica, intervals_per_replica = self.shuffle.get_chunks_and_intervals_per_ranks(
             self.distributed_env, self.current_epoch
         )
         current_chunks = chunks_per_replica[self.distributed_env.global_rank % self.distributed_env.world_size]
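
The net effect of this file's changes: `shuffle` becomes a strict boolean (validated in `__init__`), truncation moves behind the new `drop_last` flag, and `FullShuffle` becomes the only shuffling strategy. A minimal usage sketch, assuming a dataset already optimised and cached under the hypothetical name "my_dataset":

    from lightning.data.streaming.dataset import StreamingDataset

    # "my_dataset" is a placeholder; the data must already have been prepared
    # with the DatasetOptimiser class mentioned in the docstring above.
    dataset = StreamingDataset(name="my_dataset", shuffle=True, drop_last=True)

    # Strings are no longer accepted for shuffle after this change:
    # StreamingDataset(name="my_dataset", shuffle="truncated")  # raises ValueError

    for item in dataset:
        ...  # chunks are distributed across ranks by FullShuffle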

src/lightning/data/streaming/shuffle.py

Lines changed: 50 additions & 86 deletions
@@ -24,17 +24,27 @@
 class Shuffle(ABC):
     """Shuffle describe how to distribute chunked datasets across processes and workers."""
 
-    def __init__(self, cache: Cache, seed: int):
+    def __init__(self, cache: Cache, seed: int, drop_last: bool):
         self.cache = cache
         self.seed = seed
+        self.drop_last = drop_last
         self.random_state = None
 
-    @abstractmethod
+    @lru_cache(maxsize=10)
     def get_len(self, distributed_env: _DistributedEnv, current_epoch: int) -> int:
-        pass
+        _, intervals_per_ranks = self.get_chunks_and_intervals_per_ranks(distributed_env, current_epoch)
+
+        if self.drop_last:
+            items_per_process = [
+                sum((interval[-1] - interval[0]) for interval in intervals) for intervals in intervals_per_ranks
+            ]
+            min_items_per_process = min(items_per_process)
+            return min_items_per_process
+
+        return sum((interval[-1] - interval[0]) for interval in intervals_per_ranks[distributed_env.global_rank])
 
     @abstractmethod
-    def get_chunks_and_intervals_per_process(self, distributed_env: _DistributedEnv, current_epoch: int) -> Any:
+    def get_chunks_and_intervals_per_ranks(self, distributed_env: _DistributedEnv, current_epoch: int) -> Any:
         pass
 
     @abstractmethod
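
In other words, `get_len` is now shared by all strategies: with `drop_last` every rank reports the minimum item count across ranks, otherwise each rank reports its own count. A toy illustration of that arithmetic (the interval values are invented for the example):

    # Each [begin, end] interval contributes end - begin items.
    intervals_per_ranks = [
        [[0, 50], [50, 120]],  # rank 0 -> 120 items
        [[120, 230]],          # rank 1 -> 110 items
    ]

    items_per_rank = [sum(end - begin for begin, end in intervals) for intervals in intervals_per_ranks]

    # drop_last=True: both ranks agree on len(dataset) == 110.
    assert min(items_per_rank) == 110

    # drop_last=False: rank 0 would report 120, rank 1 would report 110.
    assert items_per_rank[0] == 120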
@@ -43,79 +53,29 @@ def __call__(self, array: np.ndarray) -> List[int]:
 
 
 class NoShuffle(Shuffle):
-    """NoShuffle doesn't shuffle the items and ensure all the processes receive the same number of items."""
+    """NoShuffle doesn't shuffle the items and ensure all the processes receive the same number of items if drop_last
+    is True."""
 
     @lru_cache(maxsize=10)
-    def get_len(self, distributed_env: _DistributedEnv, current_epoch: int) -> int:
-        _, intervals_per_process = self.get_chunks_and_intervals_per_process(distributed_env, current_epoch)
-        min_items_per_process = min(
-            [sum([(interval[-1] - interval[0]) for interval in intervals]) for intervals in intervals_per_process]
-        )
-        return min_items_per_process
-
-    @lru_cache(maxsize=10)
-    def get_chunks_and_intervals_per_process(self, distributed_env: _DistributedEnv, current_epoch: int) -> Any:
+    def get_chunks_and_intervals_per_ranks(self, distributed_env: _DistributedEnv, current_epoch: int) -> Any:
         self.random_state = np.random.RandomState(seed=self.seed + current_epoch)  # type: ignore
         chunk_intervals = self.cache.get_chunk_intervals()
         indexes = list(range(len(chunk_intervals)))
         shuffled_chunk_intervals = np.asarray(chunk_intervals)[indexes]
 
-        chunks_per_process: List[List[int]] = [[] for _ in range(distributed_env.world_size)]
-        intervals_per_process: List[List[List[int]]] = [[] for _ in range(distributed_env.world_size)]
+        chunks_per_ranks: List[List[int]] = [[] for _ in range(distributed_env.world_size)]
+        intervals_per_ranks: List[List[List[int]]] = [[] for _ in range(distributed_env.world_size)]
         for index, (chunk_index, chunk_interval) in enumerate(zip(indexes, shuffled_chunk_intervals)):
             replica_index = index % distributed_env.world_size
-            chunks_per_process[replica_index].append(chunk_index)
-            intervals_per_process[replica_index].append(chunk_interval)
+            chunks_per_ranks[replica_index].append(chunk_index)
+            intervals_per_ranks[replica_index].append(chunk_interval)
 
-        return chunks_per_process, intervals_per_process
+        return chunks_per_ranks, intervals_per_ranks
 
     def __call__(self, array: np.ndarray) -> List[int]:
         return array.tolist()
 
 
-class TruncatedShuffle(Shuffle):
-    """TruncatedShuffle shuffles the chunks and associates them to the ranks.
-
-    As the number of items in a chunk varies, it is possible for a rank to end up with more or less items.
-
-    To ensure the same fixed dataset length for all ranks, we compute the minimum number of items across all ranks.
-
-    For the ranks with more items than the minimum, the remaining items are dropped.
-
-    Note: This is the fastest sampling strategy but at the cost of losing items.
-
-    """
-
-    @lru_cache(maxsize=10)
-    def get_len(self, distributed_env: _DistributedEnv, current_epoch: int) -> int:
-        _, intervals_per_process = self.get_chunks_and_intervals_per_process(distributed_env, current_epoch)
-        min_items_per_process = min(
-            [sum([(interval[-1] - interval[0]) for interval in intervals]) for intervals in intervals_per_process]
-        )
-        return min_items_per_process
-
-    @lru_cache(maxsize=10)
-    def get_chunks_and_intervals_per_process(self, distributed_env: _DistributedEnv, current_epoch: int) -> Any:
-        self.random_state = np.random.RandomState(seed=self.seed + current_epoch)  # type: ignore
-        chunk_intervals = self.cache.get_chunk_intervals()
-        indexes = range(len(chunk_intervals))
-        shuffled_indexes = self.random_state.permutation(indexes)
-        shuffled_chunk_intervals = np.asarray(chunk_intervals)[shuffled_indexes]
-
-        chunks_per_process: List[List[int]] = [[] for _ in range(distributed_env.world_size)]
-        intervals_per_process: List[List[List[int]]] = [[] for _ in range(distributed_env.world_size)]
-        for index, (chunk_index, chunk_interval) in enumerate(zip(shuffled_indexes, shuffled_chunk_intervals)):
-            replica_index = index % distributed_env.world_size
-            chunks_per_process[replica_index].append(chunk_index)
-            intervals_per_process[replica_index].append(chunk_interval)
-
-        return chunks_per_process, intervals_per_process
-
-    def __call__(self, array: np.ndarray) -> List[int]:
-        assert self.random_state
-        return self.random_state.permutation(array).tolist()
-
-
 class FullShuffle(Shuffle):
     """FullShuffle shuffles the chunks and associates them to the ranks.
 
@@ -135,36 +95,40 @@ class FullShuffle(Shuffle):
     """
 
     @lru_cache(maxsize=10)
-    def get_len(self, distributed_env: _DistributedEnv, current_epoch: int) -> int:
-        _, intervals_per_process = self.get_chunks_and_intervals_per_process(distributed_env, current_epoch)
-        min_items_per_process = min([sum([(i[-1] - i[0]) for i in intervals]) for intervals in intervals_per_process])
-        return min_items_per_process
-
-    @lru_cache(maxsize=10)
-    def get_chunks_and_intervals_per_process(self, distributed_env: _DistributedEnv, current_epoch: int) -> Any:
+    def get_chunks_and_intervals_per_ranks(self, distributed_env: _DistributedEnv, current_epoch: int) -> Any:
         self.random_state = np.random.RandomState(seed=self.seed + current_epoch)  # type: ignore
+
+        # 1. Get the intervals
         chunk_intervals = self.cache.get_chunk_intervals()
+
+        # 2. Shuffle them
         indexes = range(len(chunk_intervals))
         shuffled_indexes = self.random_state.permutation(indexes)
         shuffled_chunk_intervals = np.asarray(chunk_intervals)[shuffled_indexes]
 
+        # 3. Compute the items budget of each rank
         num_items = sum([(interval[-1] - interval[0]) for interval in chunk_intervals])
-        num_items_per_process: List[int] = [
-            num_items // distributed_env.world_size for _ in range(distributed_env.world_size)
+        num_items_per_ranks: List[int] = [
+            num_items // distributed_env.world_size + num_items % distributed_env.world_size
+            if rank == distributed_env.world_size - 1 and not self.drop_last
+            else num_items // distributed_env.world_size
+            for rank in range(distributed_env.world_size)
         ]
-        chunks_per_process: List[List[int]] = [[] for _ in range(distributed_env.world_size)]
-        intervals_per_process: List[List[List[int]]] = [[] for _ in range(distributed_env.world_size)]
+        chunks_per_ranks: List[List[int]] = [[] for _ in range(distributed_env.world_size)]
+        intervals_per_ranks: List[List[List[int]]] = [[] for _ in range(distributed_env.world_size)]
+
+        # 4. Assign the chunk & intervals to each rank
         for chunk_index, chunk_interval in zip(shuffled_indexes, shuffled_chunk_intervals):
-            process_index = 0
+            rank = 0
 
             while True:
-                if process_index == len(num_items_per_process):
+                if rank == len(num_items_per_ranks):
                     break
 
-                items_left_to_assign = num_items_per_process[process_index]
+                items_left_to_assign = num_items_per_ranks[rank]
 
                 if items_left_to_assign == 0:
-                    process_index += 1
+                    rank += 1
                     continue
 
                 items_in_chunk = chunk_interval[-1] - chunk_interval[0]
@@ -173,19 +137,19 @@ def get_chunks_and_intervals_per_process(self, distributed_env: _DistributedEnv,
                     break
 
                 if items_in_chunk > items_left_to_assign:
-                    chunks_per_process[process_index].append(chunk_index)
+                    chunks_per_ranks[rank].append(chunk_index)
                     begin, end = chunk_interval
-                    intervals_per_process[process_index].append([begin, begin + items_left_to_assign])
-                    chunk_interval = (begin + items_left_to_assign + 1, end)
-                    num_items_per_process[process_index] = 0
-                    process_index += 1
+                    intervals_per_ranks[rank].append([begin, begin + items_left_to_assign])
+                    chunk_interval = (begin + items_left_to_assign, end)
+                    num_items_per_ranks[rank] = 0
+                    rank += 1
                 else:
-                    chunks_per_process[process_index].append(chunk_index)
-                    intervals_per_process[process_index].append(chunk_interval)
-                    num_items_per_process[process_index] -= items_in_chunk
+                    chunks_per_ranks[rank].append(chunk_index)
+                    intervals_per_ranks[rank].append(chunk_interval)
+                    num_items_per_ranks[rank] -= items_in_chunk
                     break
 
-        return chunks_per_process, intervals_per_process
+        return chunks_per_ranks, intervals_per_ranks
 
     def __call__(self, array: np.ndarray) -> List[int]:
         assert self.random_state
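
To see what the budget and assignment steps above do end to end, here is a self-contained sketch of the same greedy algorithm on invented chunk intervals (world_size=2, drop_last=False). It is not the library code, but it reproduces steps 3 and 4, including the off-by-one fix where the next rank resumes at `begin + items_left_to_assign` instead of one item later:

    chunk_intervals = [(0, 7), (7, 12), (12, 20)]  # 20 items across 3 chunks
    world_size = 2
    drop_last = False

    # Step 3: item budget per rank; the last rank absorbs the remainder
    # unless drop_last is set.
    num_items = sum(end - begin for begin, end in chunk_intervals)
    budgets = [
        num_items // world_size + (num_items % world_size if rank == world_size - 1 and not drop_last else 0)
        for rank in range(world_size)
    ]

    # Step 4: walk the chunks, splitting a chunk when it overflows a budget.
    intervals_per_ranks = [[] for _ in range(world_size)]
    for begin, end in chunk_intervals:
        rank = 0
        while begin < end and rank < world_size:
            if budgets[rank] == 0:
                rank += 1
                continue
            take = min(budgets[rank], end - begin)
            intervals_per_ranks[rank].append([begin, begin + take])
            budgets[rank] -= take
            begin += take  # the off-by-one fix: no item is skipped at the split

    print(intervals_per_ranks)  # [[[0, 7], [7, 10]], [[10, 12], [12, 20]]]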

tests/tests_data/streaming/test_cache.py

Lines changed: 3 additions & 0 deletions
@@ -162,12 +162,15 @@ def test_cache_with_simple_format(tmpdir):
 
     cache = Cache(cache_dir, chunk_bytes=90)
 
+    # you encode data
     for i in range(100):
         cache[i] = i
 
+    # I am done, write the index ...
     cache.done()
     cache.merge()
 
+    # please, decode the data for me.
     for i in range(100):
         assert i == cache[i]
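
Outside the test suite, the same encode/merge/decode round trip might look like this sketch (the temporary directory and the `chunk_bytes` value are arbitrary choices):

    import tempfile

    from lightning.data.streaming import Cache

    with tempfile.TemporaryDirectory() as cache_dir:
        cache = Cache(cache_dir, chunk_bytes=90)

        # Encode: items are written out as chunk files bounded by chunk_bytes.
        for i in range(100):
            cache[i] = i

        # Finalise pending chunks and merge the per-worker indexes.
        cache.done()
        cache.merge()

        # Decode: items can be read back by index.
        assert all(cache[i] == i for i in range(100))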

0 commit comments
