Commit 4192053

fix: dataset replay last batch (#300)
When using the request generator at a lower sending rate, the server's send and recv paths fell out of sync. In the previous implementation, the dataset returned None once no batches remained, so with the longer wait times introduced by the slower rate, the server finished processing the last batches before it received the signal that they were done. The fix detects the last batch early and sends that information along with the batches, so the server knows which batch is the last one.
1 parent 33eda74
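The core of the change is a one-step lookahead: instead of emitting a trailing None sentinel after the data runs out, the producer buffers one batch ahead and tags the final real batch as it goes out. A minimal, hypothetical sketch of that pattern (the names here are illustrative, not from the repo):

```python
from collections.abc import Iterable, Iterator
from typing import TypeVar

T = TypeVar("T")

def with_last_flag(items: Iterable[T]) -> Iterator[tuple[T, bool]]:
    """Yield (item, is_last) pairs by holding one item in a lookahead buffer."""
    it = iter(items)
    try:
        prev = next(it)
    except StopIteration:
        return  # nothing to yield for an empty source
    for curr in it:
        yield prev, False  # another item is already buffered, so not last
        prev = curr
    yield prev, True  # the buffer now holds the final item

print(list(with_last_flag([10, 20, 30])))
# [(10, False), (20, False), (30, True)]
```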

3 files changed: +22 −17 lines

infscale/execution/pipeline.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -273,10 +273,9 @@ async def _server_send(self, router: Router):
         self._end_of_send = False
 
         async def _inner_send(batches: list[torch.Tensor | None]) -> None:
-            for batch in batches:
-                if batch is None:
+            for batch, is_last in batches:
+                if is_last:
                     self._end_of_send = True
-                    break
 
             await self._wait_tx_permission()
```
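For context, a self-contained sketch of the consumer-side behavior this hunk establishes, with a stand-in send() (hypothetical, not the repo's transmit path): the flagged batch is now real data that must still go out, so the loop records end-of-send instead of breaking early.

```python
import asyncio

async def send(batch) -> None:
    """Hypothetical stand-in for the real transmit step."""
    print("sent", batch)

async def inner_send(batches: list[tuple[int, bool]]) -> bool:
    """Mimic the fixed loop: record end-of-send but keep sending."""
    end_of_send = False
    for batch, is_last in batches:
        if is_last:
            end_of_send = True  # no break: the flagged batch is real data
        await send(batch)
    return end_of_send

print(asyncio.run(inner_send([(101, False), (102, True)])))
# sent 101 / sent 102 / True
```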

infscale/module/dataset.py

Lines changed: 13 additions & 7 deletions

```diff
@@ -106,6 +106,7 @@ def collate_fn(examples):
         mmd.trace_inputs = trace_inputs
 
         self.model_group = mmd.model_group
+        self._batch_list: list[Tensor | None] = []
 
     def configure(
         self, micro_batch_size: int, device: torch.device, in_memory: bool, replay: int
@@ -132,6 +133,8 @@ def _inner_send_b2d(batch):
 
         if not self._in_memory:
             self._send_batch_to_device = _inner_send_b2d
+            batch = next(self.data_iter)
+            self._batch_list.append(batch)
             return
 
         # do nothing in case of in-memory loading
@@ -144,11 +147,12 @@ def _inner_send_b2d(batch):
             self.batches.append(batch)
 
         self.data_iter = iter(self.batches)
+        batch = next(self.data_iter)
+        self._batch_list.append(batch)
 
     def _handle_dataset_playback(self) -> Tensor | None:
         if self._replay == 0:
             return None
-
         # this ensures self._replay decreases to zero or
         # stays as -1 (infinite)
         self._replay = max(self._replay - 1, -1)
@@ -160,20 +164,22 @@ def _handle_dataset_playback(self) -> Tensor | None:
 
         return next(self.data_iter)
 
-    def next_batch(self) -> Tensor | None:
-        """Return next data tensor.
-
-        Once all the data is consumed, it returns None.
-        """
+    def next_batch(self) -> tuple[Tensor, bool]:
+        """Return next data tensor and bool if last batch."""
         try:
             batch = next(self.data_iter)
+            self._batch_list.append(batch)
         except StopIteration:
             batch = self._handle_dataset_playback()
+            self._batch_list.append(batch)
 
+        batch = self._batch_list.pop(0)
         # noop for in-memory case; otherwise, load batch to a correct device
         self._send_batch_to_device(batch)
 
-        return batch
+        is_last = self._batch_list[0] is None
+
+        return batch, is_last
 
     @staticmethod
     def create_image_dataset(
```
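A runnable, hypothetical reduction of the new next_batch() control flow (ints stand in for tensors, and the class is not the repo's): the buffer is pre-filled with one batch during configuration, each call appends the next batch (or the None that playback returns once replay is exhausted) and pops the oldest, and peeking at the new head tells the caller whether the popped batch was the last one.

```python
class LookaheadDataset:
    """Illustrative stand-in for the dataset's one-batch lookahead buffer."""

    def __init__(self, data: list[int], replay: int = 0) -> None:
        self._data = data
        self._replay = replay
        self.data_iter = iter(self._data)
        # pre-fill one batch, mirroring what configure() now does
        self._batch_list: list[int | None] = [next(self.data_iter)]

    def _handle_dataset_playback(self) -> int | None:
        if self._replay == 0:
            return None  # no replays left; None marks the end
        self._replay = max(self._replay - 1, -1)
        self.data_iter = iter(self._data)
        return next(self.data_iter)

    def next_batch(self) -> tuple[int, bool]:
        try:
            self._batch_list.append(next(self.data_iter))
        except StopIteration:
            self._batch_list.append(self._handle_dataset_playback())
        batch = self._batch_list.pop(0)
        is_last = self._batch_list[0] is None  # peek at the lookahead slot
        return batch, is_last

ds = LookaheadDataset([1, 2, 3])
print([ds.next_batch() for _ in range(3)])
# [(1, False), (2, False), (3, True)]
```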

infscale/request/generator.py

Lines changed: 7 additions & 7 deletions

```diff
@@ -61,7 +61,7 @@ async def get(self) -> list[Tensor | None]:
 class DefaultGenerator(Generator):
     """DefaultGenerator class."""
 
-    async def get(self) -> list[Tensor | None]:
+    async def get(self) -> list[tuple[Tensor, bool]]:
         """Return one batch of requests as a list.
 
         initialize() method must be called once before calling this method.
@@ -99,10 +99,10 @@ async def _generate(self) -> None:
         await self._gen_evt.wait()
 
         while True:
-            batch = self._dataset.next_batch()
-            await self._queue.put(batch)
+            batch, is_last = self._dataset.next_batch()
+            await self._queue.put((batch, is_last))
 
-            if batch is None:
+            if is_last:
                 break
 
         self._mc.update(self._seqno)
@@ -114,7 +114,7 @@ async def _generate(self) -> None:
     def _compute_iat(self):
         return np.random.exponential(scale=1 / self._batch_rate)
 
-    async def get(self) -> list[Tensor | None]:
+    async def get(self) -> list[tuple[Tensor, bool]]:
         """Return one batch of requests.
 
         initialize() method must be called once before calling this method.
@@ -124,8 +124,8 @@ async def get(self) -> list[Tensor | None]:
         batches = []
         while True:
             # this guarantees at least one batch of requests is returned
-            batch = await self._queue.get()
-            batches.append(batch)
+            batch, is_last = await self._queue.get()
+            batches.append((batch, is_last))
 
             if self._queue.empty():
                 break
```
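Finally, a hypothetical end-to-end sketch of the queue handoff after the change (names are illustrative): the producer enqueues (batch, is_last) tuples and stops on the flag instead of on a None batch, while the drain side returns whatever has accumulated, always at least one tuple.

```python
import asyncio

async def produce(queue: asyncio.Queue, batches: list[tuple[int, bool]]) -> None:
    for batch, is_last in batches:
        await queue.put((batch, is_last))
        if is_last:
            break  # the flag replaces the old `if batch is None` check

async def drain(queue: asyncio.Queue) -> list[tuple[int, bool]]:
    out = []
    while True:
        out.append(await queue.get())  # blocks until at least one tuple arrives
        if queue.empty():
            break
    return out

async def main() -> None:
    q: asyncio.Queue = asyncio.Queue()
    await produce(q, [(1, False), (2, False), (3, True)])
    print(await drain(q))  # [(1, False), (2, False), (3, True)]

asyncio.run(main())
```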
