StreamingDataset: Add intra node shuffling to accelerate second epoch (#19296)

tchaton · web-flow · commit 75510dd9f856 · 2024-01-19T17:08:32.000Z
diff --git a/requirements/data/data.txt b/requirements/data/data.txt
@@ -5,3 +5,4 @@ lightning-utilities >=0.8.0, <0.10.0
 # to be able to include also PL 2.0 and preserve `>` needed for CI min version bypass
 torch >0.14.0, <2.2.0
 lightning-cloud
+filelock
diff --git a/src/lightning/data/streaming/combined.py b/src/lightning/data/streaming/combined.py
@@ -46,7 +46,7 @@ def __init__(
 
     def __len__(self) -> int:
         assert self._weights
-        return int(sum(w * len(d) for w, d in zip(self._weights, self._datasets)))
+        return int(min([1 / w * len(d) for w, d in zip(self._weights, self._datasets) if w > 0]))
 
     def __iter__(self) -> Iterator[Any]:
         assert self._weights
diff --git a/src/lightning/data/streaming/dataset.py b/src/lightning/data/streaming/dataset.py
@@ -29,7 +29,7 @@
 from lightning.data.streaming.sampler import ChunkedIndex
 from lightning.data.streaming.serializers import Serializer
 from lightning.data.streaming.shuffle import FullShuffle, NoShuffle, Shuffle
-from lightning.data.utilities.env import Environment, _DistributedEnv, _WorkerEnv
+from lightning.data.utilities.env import _DistributedEnv, _WorkerEnv
 
 
 class StreamingDataset(IterableDataset):
@@ -91,13 +91,9 @@ def __init__(
         self._state_dict: Optional[Dict[str, Any]] = None
 
     def _create_cache(self, worker_env: _WorkerEnv) -> Cache:
-        env = Environment(dist_env=self.distributed_env, worker_env=worker_env)
-
         if _should_replace_path(self.input_dir.path):
-            # FIXME: Remove the `shard_rank` from the cache_path to enable reloading chunks for the second epoch
-            # without paying the cost of re-download
             cache_path = _try_create_cache_dir(
-                input_dir=self.input_dir.path if self.input_dir.path else self.input_dir.url, shard_rank=env.shard_rank
+                input_dir=self.input_dir.path if self.input_dir.path else self.input_dir.url
             )
             if cache_path is not None:
                 self.input_dir.path = cache_path
@@ -362,13 +358,13 @@ def _validate_state_dict(self) -> None:
             )
 
 
-def _try_create_cache_dir(input_dir: Optional[str], shard_rank: int = 0) -> Optional[str]:
+def _try_create_cache_dir(input_dir: Optional[str]) -> Optional[str]:
     hash_object = hashlib.md5((input_dir or "").encode())
     if "LIGHTNING_CLUSTER_ID" not in os.environ or "LIGHTNING_CLOUD_PROJECT_ID" not in os.environ:
-        cache_dir = os.path.join(_DEFAULT_CACHE_DIR, hash_object.hexdigest(), str(shard_rank))
+        cache_dir = os.path.join(_DEFAULT_CACHE_DIR, hash_object.hexdigest())
         os.makedirs(cache_dir, exist_ok=True)
         return cache_dir
-    cache_dir = os.path.join("/cache", "chunks", hash_object.hexdigest(), str(shard_rank))
+    cache_dir = os.path.join("/cache", "chunks", hash_object.hexdigest())
     os.makedirs(cache_dir, exist_ok=True)
     return cache_dir
 
diff --git a/src/lightning/data/streaming/downloader.py b/src/lightning/data/streaming/downloader.py
@@ -16,6 +16,8 @@
 from typing import Any, Dict, List
 from urllib import parse
 
+from filelock import FileLock, Timeout
+
 from lightning.data.streaming.client import S3Client
 
 
@@ -50,21 +52,28 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None:
 
         extra_args: Dict[str, Any] = {}
 
-        # Issue: https://github.com/boto/boto3/issues/3113
-        self._client.client.download_file(
-            obj.netloc,
-            obj.path.lstrip("/"),
-            local_filepath,
-            ExtraArgs=extra_args,
-            Config=TransferConfig(use_threads=False),
-        )
+        try:
+            with FileLock(local_filepath + ".lock", timeout=1):
+                if not os.path.exists(local_filepath):
+                    # Issue: https://github.com/boto/boto3/issues/3113
+                    self._client.client.download_file(
+                        obj.netloc,
+                        obj.path.lstrip("/"),
+                        local_filepath,
+                        ExtraArgs=extra_args,
+                        Config=TransferConfig(use_threads=False),
+                    )
+        except Timeout:
+            # another process is responsible for downloading that file, continue
+            pass
 
 
 class LocalDownloader(Downloader):
     def download_file(self, remote_filepath: str, local_filepath: str) -> None:
         if not os.path.exists(remote_filepath):
             raise FileNotFoundError(f"The provided remote_path doesn't exist: {remote_filepath}")
-        if remote_filepath != local_filepath:
+
+        if remote_filepath != local_filepath and not os.path.exists(local_filepath):
             shutil.copy(remote_filepath, local_filepath)
 
 
diff --git a/src/lightning/data/streaming/item_loader.py b/src/lightning/data/streaming/item_loader.py
@@ -181,22 +181,23 @@ def pre_load_chunk(self, chunk_index: int, chunk_filepath: str) -> None:
         if chunk_filepath not in self._chunk_filepaths:
             self._chunk_filepaths[chunk_filepath] = True
 
-        self._load_chunk(chunk_index, chunk_filepath)
+        if os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size > 0:
+            self._load_chunk(chunk_index, chunk_filepath)
 
     def load_item_from_chunk(self, index: int, chunk_index: int, chunk_filepath: str, begin: int) -> torch.Tensor:
         if chunk_filepath in self._chunk_filepaths and not os.path.isfile(chunk_filepath):
             del self._chunk_filepaths[chunk_filepath]
 
         if chunk_filepath not in self._chunk_filepaths:
-            first_exists = exists = os.path.exists(chunk_filepath)
+            first_exists = exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size > 0
 
             while not exists:
                 sleep(0.1)
-                exists = os.path.exists(chunk_filepath)
+                exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size > 0
 
             # Wait to avoid any corruption when the file appears
             if not first_exists:
-                sleep(0.001)
+                sleep(0.1)
 
             self._chunk_filepaths[chunk_filepath] = True
 
diff --git a/src/lightning/data/streaming/reader.py b/src/lightning/data/streaming/reader.py
@@ -51,6 +51,7 @@ def __init__(
         self,
         config: ChunksConfig,
         item_loader: BaseItemLoader,
+        distributed_env: _DistributedEnv,
         max_cache_size: Optional[int] = None,
         max_pre_download: int = 2,
     ) -> None:
@@ -59,15 +60,17 @@ def __init__(
         self._item_loader = item_loader
         self._max_pre_download = max_pre_download
         self._pre_download_counter = 0
+        self._distributed_env = distributed_env
 
         self._chunks_index_to_be_deleted: List[int] = []
         self._max_cache_size = max_cache_size
         self._parent_cache_dir = os.path.dirname(self._config._cache_dir)
         self._to_download_queue: multiprocessing.Queue = multiprocessing.Queue()
         self._to_delete_queue: multiprocessing.Queue = multiprocessing.Queue()
 
-        # FIXME: This should be divided by the number of nodes to provide a more granular support with scaling out
-        self._delete_chunks_when_processed = self._config.num_bytes > max_cache_size if max_cache_size else False
+        # Check whether a dataset slice fits on the node
+        num_bytes_per_nodes = self._config.num_bytes // self._distributed_env.num_nodes
+        self._delete_chunks_when_processed = num_bytes_per_nodes > max_cache_size if max_cache_size else False
         self._has_exited = False
 
     def download(self, chunk_indexes: List[int]) -> None:
@@ -229,7 +232,9 @@ def read(self, index: ChunkedIndex) -> Any:
         if self._config and self._config._remote_dir:
             # Create and start the prepare chunks thread
             if self._prepare_thread is None and self._config:
-                self._prepare_thread = PrepareChunksThread(self._config, self._item_loader, self._max_cache_size)
+                self._prepare_thread = PrepareChunksThread(
+                    self._config, self._item_loader, self._distributed_env, self._max_cache_size
+                )
                 self._prepare_thread.start()
                 if index.chunk_indexes:
                     self._prepare_thread.download(index.chunk_indexes)
diff --git a/src/lightning/data/streaming/shuffle.py b/src/lightning/data/streaming/shuffle.py
@@ -97,55 +97,108 @@ def get_chunks_and_intervals_per_ranks(self, distributed_env: _DistributedEnv, c
         # 2. Shuffle them
         indexes = range(len(chunk_intervals))
 
-        # FIXME: Shuffling should be done only within the nodes to benefit
-        # from cache if the dataset doesn't fit on the node.
-        shuffled_indexes = np.random.RandomState(seed=self.seed + current_epoch).permutation(indexes)
-        shuffled_chunk_intervals = np.asarray(chunk_intervals)[shuffled_indexes]
+        # With multiple nodes, the seed_shift is kept constant across epochs.
+        # Here is why: in epoch 1, we need to shuffle the chunks
+        # and associate them to each rank. This is done here.
+        # In epoch 2 or later, we need to keep the same shuffling
+        # as in epoch 1 because we shuffle a second time within each node.
+        # This is done slightly further down in this function.
+        seed_shift = 1 if distributed_env.num_nodes > 1 else current_epoch
+        shuffled_indexes = np.random.RandomState(seed=self.seed + seed_shift).permutation(indexes)
+        shuffled_chunk_intervals = np.asarray(chunk_intervals)[shuffled_indexes].tolist()
 
         # 3. Compute the items budget of each rank
-        num_items = sum([(interval[-1] - interval[0]) for interval in chunk_intervals])
-        num_items_per_ranks: List[int] = [
-            num_items // distributed_env.world_size + num_items % distributed_env.world_size
-            if rank == distributed_env.world_size - 1 and not self.drop_last
-            else num_items // distributed_env.world_size
-            for rank in range(distributed_env.world_size)
-        ]
-        chunks_per_ranks: List[List[int]] = [[] for _ in range(distributed_env.world_size)]
-        intervals_per_ranks: List[List[List[int]]] = [[] for _ in range(distributed_env.world_size)]
-
-        # 4. Assign the chunk & intervals to each rank
-        for chunk_index, chunk_interval in zip(shuffled_indexes, shuffled_chunk_intervals):
-            rank = 0
-
-            while True:
-                if rank == len(num_items_per_ranks):
-                    break
-
-                items_left_to_assign = num_items_per_ranks[rank]
+        chunks_per_ranks, intervals_per_ranks = _associate_chunks_and_internals_to_ranks(
+            distributed_env, shuffled_indexes, shuffled_chunk_intervals, self.drop_last
+        )
 
-                if items_left_to_assign == 0:
-                    rank += 1
-                    continue
+        # For the first epoch, or when running on a single node, no further shuffling is needed
+        if current_epoch == 1 or distributed_env.num_nodes == 1:
+            return chunks_per_ranks, intervals_per_ranks
 
-                items_in_chunk = chunk_interval[-1] - chunk_interval[0]
+        # Perform shuffle within the nodes to avoid cache miss.
+        # Note: It is possible for the overlapping chunks to change due to the changing order.
+        shuffled_indexes = _intra_node_chunk_shuffle(distributed_env, chunks_per_ranks, self.seed, current_epoch)
+        shuffled_chunk_intervals = np.asarray(chunk_intervals)[shuffled_indexes].tolist()
 
-                if items_in_chunk == 0:
-                    break
-
-                if items_in_chunk > items_left_to_assign:
-                    chunks_per_ranks[rank].append(chunk_index)
-                    begin, end = chunk_interval
-                    intervals_per_ranks[rank].append([begin, begin + items_left_to_assign])
-                    chunk_interval = (begin + items_left_to_assign, end)
-                    num_items_per_ranks[rank] = 0
-                    rank += 1
-                else:
-                    chunks_per_ranks[rank].append(chunk_index)
-                    intervals_per_ranks[rank].append(chunk_interval)
-                    num_items_per_ranks[rank] -= items_in_chunk
-                    break
+        chunks_per_ranks, intervals_per_ranks = _associate_chunks_and_internals_to_ranks(
+            distributed_env, shuffled_indexes, shuffled_chunk_intervals, self.drop_last
+        )
 
         return chunks_per_ranks, intervals_per_ranks
 
     def __call__(self, array: np.ndarray, num_chunks: int, current_epoch: int, chunk_index: int) -> List[int]:
         return np.random.RandomState([self.seed, num_chunks * current_epoch, chunk_index]).permutation(array).tolist()
+
+
+def _intra_node_chunk_shuffle(
+    distributed_env: _DistributedEnv,
+    chunks_per_ranks: List[List[int]],
+    seed: int,
+    current_epoch: int,
+) -> List[int]:
+    chunk_indexes_per_nodes: Any = [[] for _ in range(distributed_env.num_nodes)]
+    for rank, chunks_per_rank in enumerate(chunks_per_ranks):
+        chunk_indexes_per_nodes[0 if distributed_env.num_nodes == 1 else rank // distributed_env.num_nodes].extend(
+            chunks_per_rank
+        )
+
+    # shuffle the chunks associated to the node
+    for i in range(len(chunk_indexes_per_nodes)):
+        # permute the indexes within the node
+        chunk_indexes_per_nodes[i] = np.random.RandomState(seed=seed + current_epoch).permutation(
+            chunk_indexes_per_nodes[i]
+        )
+
+    return [index for chunks in chunk_indexes_per_nodes for index in chunks]
+
+
+def _associate_chunks_and_internals_to_ranks(
+    distributed_env: _DistributedEnv,
+    indexes: Any,
+    chunk_intervals: Any,
+    drop_last: bool,
+) -> Tuple[List[List[int]], List[Any]]:
+    num_items = sum([(interval[-1] - interval[0]) for interval in chunk_intervals])
+    num_items_per_ranks: List[int] = [
+        num_items // distributed_env.world_size + num_items % distributed_env.world_size
+        if rank == distributed_env.world_size - 1 and not drop_last
+        else num_items // distributed_env.world_size
+        for rank in range(distributed_env.world_size)
+    ]
+    chunks_per_ranks: List[List[int]] = [[] for _ in range(distributed_env.world_size)]
+    intervals_per_ranks: List[List[List[int]]] = [[] for _ in range(distributed_env.world_size)]
+
+    # 4. Assign the chunk & intervals to each rank
+    for chunk_index, chunk_interval in zip(indexes, chunk_intervals):
+        rank = 0
+
+        while True:
+            if rank == len(num_items_per_ranks):
+                break
+
+            items_left_to_assign = num_items_per_ranks[rank]
+
+            if items_left_to_assign == 0:
+                rank += 1
+                continue
+
+            items_in_chunk = chunk_interval[-1] - chunk_interval[0]
+
+            if items_in_chunk == 0:
+                break
+
+            if items_in_chunk > items_left_to_assign:
+                chunks_per_ranks[rank].append(chunk_index)
+                begin, end = chunk_interval
+                intervals_per_ranks[rank].append([begin, begin + items_left_to_assign])
+                chunk_interval = (begin + items_left_to_assign, end)
+                num_items_per_ranks[rank] = 0
+                rank += 1
+            else:
+                chunks_per_ranks[rank].append(chunk_index)
+                intervals_per_ranks[rank].append(chunk_interval)
+                num_items_per_ranks[rank] -= items_in_chunk
+                break
+
+    return chunks_per_ranks, intervals_per_ranks
diff --git a/src/lightning/data/utilities/env.py b/src/lightning/data/utilities/env.py
@@ -13,9 +13,10 @@ class _DistributedEnv:
 
     """
 
-    def __init__(self, world_size: int, global_rank: int):
+    def __init__(self, world_size: int, global_rank: int, num_nodes: int):
         self.world_size = world_size
         self.global_rank = global_rank
+        self.num_nodes = num_nodes
 
     @classmethod
     def detect(cls) -> "_DistributedEnv":
@@ -37,7 +38,14 @@ def detect(cls) -> "_DistributedEnv":
         if world_size is None or world_size == -1:
             world_size = 1
 
-        return cls(world_size=world_size, global_rank=global_rank)
+        # TODO: Add support for other accelerators
+        num_nodes = (world_size // torch.cuda.device_count()) if torch.cuda.is_available() else 1
+
+        if num_nodes > 1:
+            # validate the world size is divisible by the number of GPUs
+            assert world_size % torch.cuda.device_count() == 0
+
+        return cls(world_size=world_size, global_rank=global_rank, num_nodes=num_nodes)
 
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}(world_size: {self.world_size}, global_rank: {self.global_rank}\n)"
@@ -113,7 +121,8 @@ def from_args(
                 the current training process
 
         """
-        dist_env = _DistributedEnv(dist_world_size, global_rank)
+        num_nodes = (dist_world_size // torch.cuda.device_count()) if torch.cuda.is_available() else 1
+        dist_env = _DistributedEnv(dist_world_size, global_rank, num_nodes)
         worker_env = _WorkerEnv(num_workers, current_worker_rank)
         return cls(dist_env=dist_env, worker_env=worker_env)
 
diff --git a/tests/tests_data/streaming/test_dataset.py b/tests/tests_data/streaming/test_dataset.py
diff --git a/tests/tests_data/streaming/test_reader.py b/tests/tests_data/streaming/test_reader.py
diff --git a/tests/tests_data/streaming/test_shuffle.py b/tests/tests_data/streaming/test_shuffle.py