
Commit be1ed63

JKSenthil authored and facebook-github-bot committed
update clip grad norm for fsdp2
Reviewed By: vdogaru
Differential Revision: D70268715
fbshipit-source-id: 5f252597a1ff11f61ed692850b8946bfd3758906
1 parent: 722b387

File tree: 1 file changed (+6 −12 lines)

torchtnt/framework/auto_unit.py

Lines changed: 6 additions & 12 deletions
@@ -879,19 +879,13 @@ def _update_weights(self, state: State) -> Optional[torch.Tensor]:
         total_grad_norm = None
         # gradient norm clipping
         if clip_grad_norm:
-            if _is_fsdp_module(module):
-                if isinstance(module, FSDP):
-                    with get_timing_context(
-                        state, f"{self.__class__.__name__}.clip_grad_norm"
-                    ):
-                        total_grad_norm = module.clip_grad_norm_(
-                            max_norm=clip_grad_norm
-                        )
-                else:
-                    raise RuntimeError(
-                        "Composable FSDP clip_grad_norm is not yet implemented: https://github.com/pytorch/pytorch/issues/97271"
-                    )
+            if isinstance(module, FSDP):
+                with get_timing_context(
+                    state, f"{self.__class__.__name__}.clip_grad_norm"
+                ):
+                    total_grad_norm = module.clip_grad_norm_(max_norm=clip_grad_norm)
             else:
+                # strategy=None, DDP, and FSDP2 will work with this
                 with get_timing_context(
                     state, f"{self.__class__.__name__}.clip_grad_norm"
                 ):
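
For context, here is a minimal sketch (not part of the commit; the clip_gradients helper and the FSDP import alias below are assumptions) of the two clipping paths the hunk distinguishes: an FSDP1-wrapped module must use the wrapper's own sharding-aware clip_grad_norm_, while strategy=None, DDP, and FSDP2 (fully_shard) modules can be clipped through their regular parameters, which is why the removed _is_fsdp_module guard and the "not yet implemented" error for composable FSDP are no longer needed.

    # Illustrative sketch only; assumes FSDP refers to
    # torch.distributed.fsdp.FullyShardedDataParallel, as in the diff above.
    import torch
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

    def clip_gradients(module: torch.nn.Module, clip_grad_norm: float) -> torch.Tensor:
        if isinstance(module, FSDP):
            # FSDP1 shards parameters behind the wrapper, so its sharding-aware
            # method must compute the global norm before clipping.
            return module.clip_grad_norm_(max_norm=clip_grad_norm)
        # strategy=None, DDP, and FSDP2 (fully_shard) leave module.parameters()
        # directly clippable, so the standard utility can be used instead.
        return torch.nn.utils.clip_grad_norm_(module.parameters(), max_norm=clip_grad_norm)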
