
Commit f7e53c1

JKSenthil authored and facebook-github-bot committed
enable prefetch in autounit (#875)
Summary:
Pull Request resolved: #875

# Context
Users cannot disable prefetch in auto unit.

# This diff
Adds an `enable_prefetch` flag to auto unit, which can be used to disable prefetching if needed.

Reviewed By: galrotem

Differential Revision: D59980065

fbshipit-source-id: 2a2f2f802b8d084d495a961839773f89c8d82022
1 parent 544a225 commit f7e53c1
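
For context, a minimal usage sketch of the new flag (the `MyUnit` subclass, its placeholder `compute_loss` / `configure_optimizers_and_lr_scheduler` bodies, and the toy `torch.nn.Linear` module are illustrative assumptions; only the `enable_prefetch` keyword comes from this diff):

from typing import Tuple

import torch
from torchtnt.framework.auto_unit import AutoUnit
from torchtnt.framework.state import State

Batch = Tuple[torch.Tensor, torch.Tensor]

class MyUnit(AutoUnit[Batch]):
    # Placeholder loss: mean squared error between module output and targets.
    def compute_loss(self, state: State, data: Batch):
        inputs, targets = data
        outputs = self.module(inputs)
        return torch.nn.functional.mse_loss(outputs, targets), outputs

    # Placeholder optimizer; no LR scheduler.
    def configure_optimizers_and_lr_scheduler(self, module: torch.nn.Module):
        return torch.optim.SGD(module.parameters(), lr=0.01), None

# Prefetching stays on by default for AutoUnit; pass enable_prefetch=False to opt out.
unit = MyUnit(module=torch.nn.Linear(2, 2), enable_prefetch=False)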

File tree

2 files changed: +28 −4 lines changed


tests/framework/test_auto_unit.py

Lines changed: 11 additions & 0 deletions

@@ -642,6 +642,17 @@ def _assert_next_batch_dicts(
             },
         )
 
+    def test_enable_prefetch(self) -> None:
+        data = [1, 2, 3]
+        auto_unit = DummyAutoUnit(module=torch.nn.Linear(2, 2), enable_prefetch=True)
+
+        _ = auto_unit._get_next_batch(get_dummy_train_state(), iter(data))
+        self.assertEqual(auto_unit._phase_to_next_batch[ActivePhase.TRAIN], 2)
+
+        auto_unit = DummyAutoUnit(module=torch.nn.Linear(2, 2), enable_prefetch=False)
+        _ = auto_unit._get_next_batch(get_dummy_train_state(), iter(data))
+        self.assertIsNone(auto_unit._phase_to_next_batch[ActivePhase.TRAIN])
+
 
 Batch = Tuple[torch.Tensor, torch.Tensor]

torchtnt/framework/auto_unit.py

Lines changed: 17 additions & 4 deletions
@@ -168,6 +168,7 @@ def __init__(
         precision: Optional[Union[str, torch.dtype]] = None,
         detect_anomaly: Optional[bool] = None,
         torch_compile_params: Optional[TorchCompileParams] = None,
+        enable_prefetch: bool = True,
     ) -> None:
         super().__init__()
 
@@ -189,7 +190,9 @@ def __init__(
 
         # cuda stream to use for moving data to device
         self._prefetch_stream: Optional[torch.cuda.streams.Stream] = (
-            torch.cuda.Stream() if self.device.type == "cuda" else None
+            torch.cuda.Stream()
+            if (self.device.type == "cuda" and enable_prefetch)
+            else None
         )
         # dict mapping phase to whether the next batch which has been prefetched for that phase and is ready to be used
         self._phase_to_next_batch: dict[ActivePhase, Optional[TData]] = {
@@ -206,6 +209,7 @@ def __init__(
         }
         # whether the current batch is the last train batch
         self._is_last_batch: bool = False
+        self._enable_prefetch = enable_prefetch
 
     def move_data_to_device(
         self, state: State, data: TData, non_blocking: bool
@@ -253,6 +257,10 @@ def _prefetch_next_batch(self, state: State, data_iter: Iterator[TData]) -> None
         )
 
     def _get_next_batch(self, state: State, data: Iterator[TData]) -> TData:
+        if not self._enable_prefetch:
+            batch = next(data)
+            return self.move_data_to_device(state, batch, non_blocking=False)
+
         active_phase = state.active_phase
         if not self._phase_to_prefetched[active_phase]:
             self._prefetch_next_batch(state, data)
@@ -293,6 +301,7 @@ def __init__(
         precision: Optional[Union[str, torch.dtype]] = None,
         torch_compile_params: Optional[TorchCompileParams] = None,
         detect_anomaly: Optional[bool] = None,
+        enable_prefetch: bool = False,
     ) -> None:
         """
         AutoPredictUnit is a convenience for users who are running inference and would like to have certain features handled for them, such as:
@@ -325,6 +334,7 @@ def __init__(
             precision=precision,
             torch_compile_params=torch_compile_params,
             detect_anomaly=detect_anomaly,
+            enable_prefetch=enable_prefetch,
         )
         self.module: torch.nn.Module = prepare_module(
             module,
@@ -435,9 +445,10 @@ class AutoUnit(
         training: if True, the optimizer and optionally LR scheduler will be created after the class is initialized.
         enable_compiled_autograd: if True, `compiled_autograd` will be used to compile the backward, this is an experimental flag.
         loss_backward_retain_graph: If ``None`` or ``False``, the graph used to compute
-        the grads will be freed during loss backward pass. Note that in nearly all cases setting
-        this option to True is not needed and often can be worked around
-        in a much more efficient way.
+            the grads will be freed during loss backward pass. Note that in nearly all cases setting
+            this option to True is not needed and often can be worked around
+            in a much more efficient way.
+        enable_prefetch: if True, the data will be prefetched to the device before the next batch is loaded
 
     Note:
         Certain strategies, like :class:`~torchtnt.utils.prepare_module.FSDPStrategy` also support mixed precision as an argument, so can be configured through that class as well.
@@ -468,13 +479,15 @@ def __init__(
         training: bool = True,
         enable_compiled_autograd: bool = False,
         loss_backward_retain_graph: Optional[bool] = None,
+        enable_prefetch: bool = True,
     ) -> None:
         super().__init__(
             module=module,
             device=device,
             precision=precision,
             detect_anomaly=detect_anomaly,
             torch_compile_params=torch_compile_params,
+            enable_prefetch=enable_prefetch,
         )
 
         if not gradient_accumulation_steps > 0:
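
Note the differing defaults introduced above: the training-oriented `AutoUnit` keeps `enable_prefetch=True`, while `AutoPredictUnit` defaults to `enable_prefetch=False`. A short sketch of opting into prefetching for prediction (the toy `torch.nn.Linear` module is only an illustration):

import torch
from torchtnt.framework.auto_unit import AutoPredictUnit

# AutoPredictUnit now defaults to enable_prefetch=False, so prefetching must be
# requested explicitly; per the diff above, on CUDA devices this also creates a
# dedicated prefetch stream for moving data to the device.
predict_unit = AutoPredictUnit(module=torch.nn.Linear(2, 2), enable_prefetch=True)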
