
Commit 89b12e6

Xiaolong Wang authored and facebook-github-bot committed
enable be (#2851)
Summary:
Pull Request resolved: #2851

Set up using lsr as input; this needs to change once the input is ready.

On exhaustiveness:
- the dataloader requires `drop_incomplete=False`
- disable fullsync and enable exhaustiveness
- fix torchrec train_pipeline, which didn't account for exhaustiveness

Need to follow up on FI behavior when in BE:
- if the cache is never updated (0 in standalone BE, or < itrn when training finishes), update it
- if the cache is up to date, use it

Differential Revision: D71827944

Privacy Context Container: L1292699

fbshipit-source-id: 1460a9673845decdeecc53d0fc292dd360125146
1 parent 577d4bd commit 89b12e6
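
The exhaustiveness notes above hinge on per-rank batch counts diverging once trailing partial batches are kept. A minimal sketch, assuming hypothetical shard sizes (not from this commit), of why some ranks exhaust their data before others:

import math

batch_size = 4
rows_per_rank = {0: 9, 1: 13}  # hypothetical uneven shard sizes

for rank, rows in rows_per_rank.items():
    # dropping the incomplete tail floors; exhaustive iteration ceils
    dropped = rows // batch_size
    exhaustive = math.ceil(rows / batch_size)
    print(f"rank {rank}: {dropped} batches (tail dropped) vs {exhaustive} (exhaustive)")

# rank 0: 2 batches (tail dropped) vs 3 (exhaustive)
# rank 1: 3 batches (tail dropped) vs 4 (exhaustive)
# rank 0 hits StopIteration a step before rank 1, so the pipeline on rank 0
# must keep progressing with cur_batch=None while rank 1 finishes.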

File tree

1 file changed: +24 -8 lines changed


torchrec/distributed/train_pipeline/train_pipelines.py

Lines changed: 24 additions & 8 deletions
@@ -139,17 +139,24 @@ def __init__(
         )
         self._cur_batch: Optional[In] = None
         self._connected = False
+        self._data_iter_stopped = False

     def _connect(self, dataloader_iter: Iterator[In]) -> None:
         cur_batch = next(dataloader_iter)
         self._cur_batch = cur_batch
-        with self._stream_context(self._memcpy_stream):
-            self._cur_batch = _to_device(cur_batch, self._device, non_blocking=True)
+        if cur_batch is not None:
+            with self._stream_context(self._memcpy_stream):
+                self._cur_batch = _to_device(cur_batch, self._device, non_blocking=True)
         self._connected = True

-    def _next_batch(self, dataloader_iter: Iterator[In]) -> In:
+    def _next_batch(self, dataloader_iter: Iterator[In]) -> Optional[In]:
         with record_function("## next_batch ##"):
-            next_batch = next(dataloader_iter)
+            try:
+                next_batch = next(dataloader_iter)
+            except StopIteration:
+                self._data_iter_stopped = True
+                return None
         return next_batch

     def _wait_for_batch(self, cur_batch: In) -> None:
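
The `_next_batch` change above swallows StopIteration, latches `_data_iter_stopped`, and hands back None as a sentinel. A standalone sketch of that pattern, using a hypothetical class rather than the pipeline itself:

from typing import Iterator, Optional, TypeVar

T = TypeVar("T")

class ExhaustibleSource:
    """Hypothetical stand-in mirroring the _next_batch/_data_iter_stopped pattern."""

    def __init__(self) -> None:
        self.stopped = False  # plays the role of self._data_iter_stopped

    def next_batch(self, it: Iterator[T]) -> Optional[T]:
        try:
            return next(it)
        except StopIteration:
            # Latch the flag instead of propagating, so the caller can finish
            # the in-flight step and raise on the next call.
            self.stopped = True
            return None

src = ExhaustibleSource()
it = iter([1, 2])
print(src.next_batch(it), src.next_batch(it), src.next_batch(it), src.stopped)
# -> 1 2 None True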
@@ -168,18 +175,26 @@ def _copy_batch_to_gpu(self, cur_batch: In) -> None:
     def progress(self, dataloader_iter: Iterator[In]) -> Out:
         if not self._connected:
             self._connect(dataloader_iter)
+        if self._data_iter_stopped:
+            raise StopIteration()

-        # Fetch next batch
+        # Fetch the next batch; if the iterator is depleted, raise at the start of the next progress()
         next_batch = self._next_batch(dataloader_iter)
         cur_batch = self._cur_batch
-        assert cur_batch is not None
+
+        # With an exhaustive data iterator, some ranks deplete their data first,
+        # but we still need to progress the train pipeline for the other ranks;
+        # cur_batch may be None here

         if self._model.training:
             with record_function("## zero_grad ##"):
                 self._optimizer.zero_grad()

-        self._wait_for_batch(cur_batch)
+        if cur_batch is not None:
+            self._wait_for_batch(cur_batch)

+        # The model must handle an empty cur_batch; this is needed when there are
+        # collective communication ops
         with record_function("## forward ##"):
             (losses, output) = self._model(cur_batch)
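
Per the comment in this hunk, the forward still runs with `cur_batch=None` on exhausted ranks so that any collective communication inside the model fires on every rank. A hedged sketch, with a hypothetical module (not from this commit), of what tolerating an empty batch could look like:

import torch
from torch import nn

class ToleratesEmptyBatch(nn.Module):
    """Hypothetical model whose forward accepts batch=None on exhausted ranks."""

    def __init__(self, dim: int = 8) -> None:
        super().__init__()
        self.proj = nn.Linear(dim, 1)

    def forward(self, batch):
        if batch is None:
            # Zero-row stand-in: sharded modules would still enter their
            # collectives, just with no local elements.
            batch = torch.zeros(0, self.proj.in_features)
        out = self.proj(batch)
        return out.sum(), out  # (losses, output), matching the pipeline's unpacking

model = ToleratesEmptyBatch()
losses, output = model(None)
print(losses.item(), output.shape)  # 0.0 torch.Size([0, 1])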

@@ -188,7 +203,8 @@ def progress(self, dataloader_iter: Iterator[In]) -> Out:

         # Copy the next batch to GPU
         self._cur_batch = cur_batch = next_batch
-        self._copy_batch_to_gpu(cur_batch)
+        if cur_batch is not None:
+            self._copy_batch_to_gpu(cur_batch)

         # Update
         if self._model.training:
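
Taken together, `progress()` now raises StopIteration at the start of the call after the iterator is depleted, not mid-step. A usage sketch with a hypothetical driver loop:

def run_epoch(pipeline, dataloader) -> int:
    """Drive an exhaustive pipeline until progress() signals depletion."""
    it = iter(dataloader)
    steps = 0
    while True:
        try:
            pipeline.progress(it)
        except StopIteration:
            break
        steps += 1
    return steps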
