
Commit 5dad8d3

stephenyan1231 authored and facebook-github-bot committed
add loss_backward_retain_graph to __init__() (#856)
Summary:
Pull Request resolved: #856

Expose **retain_graph** kwarg in **loss.backward()** by adding a new argument **loss_backward_retain_graph** to **AutoUnit.__init__()**

Reviewed By: JKSenthil

Differential Revision: D58901158

fbshipit-source-id: cdc051e15d1831cbc69b9cab7e92c0f6adb72c67
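For context, retain_graph controls whether autograd frees the graph once a backward pass finishes; a second backward over the same graph only works if the first call retained it. A minimal plain-PyTorch sketch of that behavior (illustration only, not part of this diff):

import torch

# Build a tiny graph and run backward over it twice.
x = torch.randn(4, requires_grad=True)
loss = (x * 2).sum()

loss.backward(retain_graph=True)  # keep the autograd graph alive
loss.backward()  # succeeds and accumulates into x.grad; without
                 # retain_graph=True above, autograd would have freed the
                 # graph and this second call would raise a RuntimeError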
1 parent aa34ffb commit 5dad8d3


torchtnt/framework/auto_unit.py

Lines changed: 10 additions & 2 deletions
@@ -434,6 +434,10 @@ class AutoUnit(
         activation_checkpoint_params: params for enabling activation checkpointing
         training: if True, the optimizer and optionally LR scheduler will be created after the class is initialized.
         enable_compiled_autograd: if True, `compiled_autograd` will be used to compile the backward, this is an experimental flag.
+        loss_backward_retain_graph: If ``None`` or ``False``, the graph used to compute
+            the grads will be freed during loss backward pass. Note that in nearly all cases setting
+            this option to True is not needed and often can be worked around
+            in a much more efficient way.
 
     Note:
         Certain strategies, like :class:`~torchtnt.utils.prepare_module.FSDPStrategy` also support mixed precision as an argument, so can be configured through that class as well.
@@ -463,6 +467,7 @@ def __init__(
         activation_checkpoint_params: Optional[ActivationCheckpointParams] = None,
         training: bool = True,
         enable_compiled_autograd: bool = False,
+        loss_backward_retain_graph: Optional[bool] = None,
     ) -> None:
         super().__init__(
             module=module,
@@ -526,6 +531,7 @@ def __init__(
 
         self.enable_compiled_autograd = enable_compiled_autograd
         self.training = training
+        self.loss_backward_retain_graph = loss_backward_retain_graph
 
         self.optimizer: Optional[torch.optim.Optimizer] = None
         self.lr_scheduler: Optional[TLRScheduler] = None
@@ -620,12 +626,14 @@ def maybe_enable_compiled_autograd(
             with get_timing_context(
                 state, f"{self.__class__.__name__}.backward"
             ):
-                scaled_loss.backward()
+                scaled_loss.backward(
+                    retain_graph=self.loss_backward_retain_graph
+                )
         else:
             with get_timing_context(
                 state, f"{self.__class__.__name__}.backward"
             ):
-                loss.backward()
+                loss.backward(retain_graph=self.loss_backward_retain_graph)
 
         total_grad_norm = None
         if should_update_weights:
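
Below is a sketch of how a training recipe might opt in to the new argument. It assumes the usual AutoUnit abstract methods, compute_loss and configure_optimizers_and_lr_scheduler, and uses a hypothetical MyUnit subclass with a toy model; it is not taken from this commit.

from typing import Any, Tuple

import torch
from torchtnt.framework.auto_unit import AutoUnit
from torchtnt.framework.state import State

Batch = Tuple[torch.Tensor, torch.Tensor]


class MyUnit(AutoUnit[Batch]):
    # Hypothetical subclass; method signatures assume the AutoUnit API as of this change.
    def compute_loss(self, state: State, data: Batch) -> Tuple[torch.Tensor, Any]:
        inputs, targets = data
        outputs = self.module(inputs)
        loss = torch.nn.functional.cross_entropy(outputs, targets)
        return loss, outputs

    def configure_optimizers_and_lr_scheduler(
        self, module: torch.nn.Module
    ) -> Tuple[torch.optim.Optimizer, None]:
        return torch.optim.SGD(module.parameters(), lr=0.01), None


# The new kwarg is forwarded to loss.backward(retain_graph=...) inside
# AutoUnit; leave it at the default None unless a second backward pass over
# the same graph is genuinely needed.
unit = MyUnit(
    module=torch.nn.Linear(16, 4),
    loss_backward_retain_graph=True,
)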

0 commit comments
