
Commit d501c48

RushabhM authored and facebook-github-bot committed
Collecting DTensor in case of sharding (#1021)
Summary: Pull Request resolved: #1021
Reviewed By: JKSenthil
Differential Revision: D79264984
fbshipit-source-id: 9c18a394e3662b0d62269c6c93b890058fe251db
1 parent 3e43021 commit d501c48

File tree

1 file changed: +4 -0 lines changed


torchtnt/framework/auto_unit.py

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,7 @@
 import torch
 from pyre_extensions import none_throws
 from torch.distributed.fsdp import FSDPModule, FullyShardedDataParallel as FSDP
+from torch.distributed.tensor import DTensor
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim.swa_utils import SWALR
 from torchtnt.framework._unit_utils import _step_requires_iterator
@@ -900,6 +901,9 @@ def _update_weights(self, state: State) -> Optional[torch.Tensor]:
                 parameters=module.parameters(),
                 max_norm=clip_grad_norm,
             )
+            # If sharded, collect the DTensor here
+            if isinstance(total_grad_norm, DTensor):
+                total_grad_norm = total_grad_norm.full_tensor()

         # gradient value clipping
         if clip_grad_value:
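
For context, a minimal standalone sketch of the logic this commit adds (not part of the commit itself; the helper name _clip_and_collect_grad_norm is hypothetical): when a module's parameters are sharded (for example with FSDP2's fully_shard), torch.nn.utils.clip_grad_norm_ returns the total gradient norm as a DTensor, and calling .full_tensor() collects it into a plain torch.Tensor so downstream logging and comparisons behave the same as in the non-sharded case.

# Illustrative sketch only; mirrors the added check in _update_weights.
import torch
from torch.distributed.tensor import DTensor


def _clip_and_collect_grad_norm(module: torch.nn.Module, max_norm: float) -> torch.Tensor:
    # Clip gradients in place and obtain the total gradient norm.
    total_grad_norm = torch.nn.utils.clip_grad_norm_(
        parameters=module.parameters(),
        max_norm=max_norm,
    )
    # With sharded parameters the norm comes back as a DTensor; collect it
    # into a regular tensor so callers can log or compare it directly.
    if isinstance(total_grad_norm, DTensor):
        total_grad_norm = total_grad_norm.full_tensor()
    return total_grad_norm

For an unsharded module the isinstance check is simply a no-op, so the same code path works in both cases.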
