
Commit a9eff9c

JKSenthil authored and facebook-github-bot committed
support fsdp2 grad scaler (#997)
Summary:
Pull Request resolved: #997

# Context
FSDP1 requires its own sharded grad scaler, while FSDP2 uses the original grad scaler (amp.grad_scaler). See https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md

# This Diff
1) Separates the FSDP1 and FSDP2 module check functions
2) Only uses the sharded grad scaler for FSDP1 modules

Reviewed By: galrotem
Differential Revision: D74410706
fbshipit-source-id: 5454069ae303a31932182ad1b06a9c8920fd5d07
1 parent f688719 commit a9eff9c
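The rule described in the Context section reduces to a small branch on precision and FSDP version. Below is a minimal, hedged sketch of that rule; the helper name pick_grad_scaler is illustrative and not part of this diff, and it assumes FSDP1 modules can be detected with an isinstance check against FullyShardedDataParallel while FSDP2 and unsharded modules need no special scaler.

import torch
from torch.amp import GradScaler
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler


def pick_grad_scaler(module: torch.nn.Module, precision: torch.dtype):
    # Loss scaling is only needed for fp16; bf16/fp32 get no scaler.
    if precision != torch.float16:
        return None
    # FSDP1 wraps the module in FullyShardedDataParallel, so an isinstance
    # check is enough to route it to the sharded scaler.
    if isinstance(module, FSDP):
        return ShardedGradScaler()
    # FSDP2 (fully_shard) and unsharded modules use the stock amp scaler.
    return GradScaler("cuda")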

File tree

4 files changed (+25, -18 lines)


tests/utils/test_precision.py

Lines changed: 5 additions & 3 deletions
@@ -42,14 +42,16 @@ def test_convert_precision_str_to_dtype_throws(self) -> None:
 
     def test_get_grad_scaler_from_precision(self) -> None:
         grad_scaler = get_grad_scaler_from_precision(
-            torch.float32, is_fsdp_module=False
+            torch.float32, is_fsdp1_module=False
         )
         self.assertIsNone(grad_scaler)
 
         grad_scaler = get_grad_scaler_from_precision(
-            torch.float16, is_fsdp_module=False
+            torch.float16, is_fsdp1_module=False
         )
         self.assertIsInstance(grad_scaler, GradScaler)
 
-        grad_scaler = get_grad_scaler_from_precision(torch.float16, is_fsdp_module=True)
+        grad_scaler = get_grad_scaler_from_precision(
+            torch.float16, is_fsdp1_module=True
+        )
         self.assertIsInstance(grad_scaler, ShardedGradScaler)

torchtnt/framework/auto_unit.py

Lines changed: 2 additions & 2 deletions
@@ -43,8 +43,8 @@
     GradScaler,
 )
 from torchtnt.utils.prepare_module import (
+    _is_fsdp1_module,
     _is_fsdp2_module,
-    _is_fsdp_module,
     ActivationCheckpointParams,
     FSDPStrategy,
     prepare_fsdp,
@@ -560,7 +560,7 @@ def __init__(
         if self.precision:
            self.grad_scaler = get_grad_scaler_from_precision(
                 self.precision,
-                is_fsdp_module=_is_fsdp_module(self.module),
+                is_fsdp1_module=_is_fsdp1_module(self.module),
             )
 
         self.step_lr_interval = step_lr_interval
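For context on where this constructor change lands, here is a hedged sketch of a user-defined AutoUnit that would exercise the changed code path; the subclass, model, data shapes, and hyperparameters are illustrative only, and exact AutoUnit constructor arguments can vary across torchtnt versions.

from typing import Any, Tuple

import torch
from torchtnt.framework.auto_unit import AutoUnit
from torchtnt.framework.state import State


class MyUnit(AutoUnit):
    def compute_loss(self, state: State, data: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, Any]:
        inputs, targets = data
        outputs = self.module(inputs)
        return torch.nn.functional.cross_entropy(outputs, targets), outputs

    def configure_optimizers_and_lr_scheduler(self, module: torch.nn.Module):
        return torch.optim.SGD(module.parameters(), lr=0.01), None


# precision="fp16" makes AutoUnit call get_grad_scaler_from_precision; with an
# unsharded module it now passes is_fsdp1_module=False and gets the stock GradScaler.
# An FSDP1-wrapped module would instead select ShardedGradScaler.
unit = MyUnit(module=torch.nn.Linear(8, 2), precision="fp16")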

torchtnt/utils/precision.py

Lines changed: 5 additions & 4 deletions
@@ -38,22 +38,23 @@ def convert_precision_str_to_dtype(precision: str) -> Optional[torch.dtype]:
 
 
 def get_grad_scaler_from_precision(
-    precision: torch.dtype, *, is_fsdp_module: Optional[bool] = False
+    precision: torch.dtype, *, is_fsdp1_module: Optional[bool] = False
 ) -> Optional[GradScaler]:
     """
     Returns the correct grad scaler to use based on the precision and whether
-    or not the model is FSDP.
+    or not the model is FSDP. FSDP required it's own sharded grad scaler. FSDP2 uses
+    the original grad scaler (amp.grad_scaler). See https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md
 
     Args:
         precision: the precision being used
-        is_fsdp_module: whether the grad scaler is for an FSDP module
+        is_fsdp1_module: whether the grad scaler is for an FSDP1 module
 
     Returns:
         The appropriate grad scaler to use, ``None`` if no grad scaler should be used.
     """
 
     if precision == torch.float16:
-        if is_fsdp_module:
+        if is_fsdp1_module:
             from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
 
             return ShardedGradScaler()
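Whichever scaler this function returns, it is driven the same way in an fp16 training step. Below is a minimal sketch of that usage; it assumes a CUDA device and uses a stand-in model, optimizer, and data rather than anything from this diff.

import torch

model = torch.nn.Linear(8, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Stand-in for get_grad_scaler_from_precision(torch.float16, is_fsdp1_module=False)
scaler = torch.amp.GradScaler("cuda")

inputs = torch.randn(4, 8, device="cuda")
targets = torch.randint(0, 2, (4,), device="cuda")

with torch.autocast(device_type="cuda", dtype=torch.float16):
    loss = torch.nn.functional.cross_entropy(model(inputs), targets)

scaler.scale(loss).backward()  # scale the loss so fp16 grads don't underflow
scaler.step(optimizer)         # unscales grads, skips the step on inf/nan
scaler.update()                # adjust the scale factor for the next iteration
optimizer.zero_grad()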

torchtnt/utils/prepare_module.py

Lines changed: 13 additions & 9 deletions
@@ -67,7 +67,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
     FullyShardedDataParallel as FSDP,
     StateDictType as _StateDictType,
 )
-from torch.distributed.fsdp._common_utils import _FSDPState
 from torch.distributed.fsdp.api import OptimStateDictConfig, StateDictConfig
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     BackwardPrefetch as _BackwardPrefetch,
@@ -435,7 +434,7 @@ def prepare_fsdp2(
     )
 
     # shard the top level model, so that all params are moved off cpu to gpu
-    if not _is_fsdp_module(module):
+    if not _is_fsdp2_module(module):
         fully_shard(module, **fsdp_kwargs)
 
     # materialized sharded meta weights to device
@@ -515,18 +514,23 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
 
 
 def _is_fsdp_module(module: torch.nn.Module) -> bool:
-    if isinstance(module, FSDP):
-        return True
+    """
+    Checks if a module is wrapped in FSDP or FSDP2
+    """
+    return _is_fsdp1_module(module) or _is_fsdp2_module(module)
 
-    # Also check for composable FSDP API
-    maybe_composable_state = _get_module_state(module)
-    if maybe_composable_state is not None:
-        return isinstance(maybe_composable_state, (_FSDPState, FSDPState))
 
-    return False
+def _is_fsdp1_module(module: torch.nn.Module) -> bool:
+    """
+    Checks if a module is sharded by original FSDP
+    """
+    return isinstance(module, FSDP)
 
 
 def _is_fsdp2_module(module: torch.nn.Module) -> bool:
+    """
+    Checks if a module is sharded by FSDP2
+    """
     maybe_composable_state = _get_module_state(module)
     if maybe_composable_state is not None:
         return isinstance(maybe_composable_state, FSDPState)
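A quick, CPU-only way to see the split after this commit: on a plain, unsharded module both new helpers report falsy, so get_grad_scaler_from_precision falls through to the stock GradScaler for fp16. The helpers are private (underscore-prefixed) and imported here only for illustration.

import torch
from torchtnt.utils.precision import get_grad_scaler_from_precision
from torchtnt.utils.prepare_module import _is_fsdp1_module, _is_fsdp2_module

model = torch.nn.Linear(4, 4)
print(_is_fsdp1_module(model), _is_fsdp2_module(model))  # both falsy for an unsharded module

scaler = get_grad_scaler_from_precision(
    torch.float16, is_fsdp1_module=_is_fsdp1_module(model)
)
print(type(scaler).__name__)  # GradScaler; an FSDP1-wrapped module would yield ShardedGradScaler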
