
Commit 7809b14

Shagun Gupta authored and facebook-github-bot committed
Bootcamp Task: Unit Tests for Gradient Clipping for DTensors (pytorch#3253)
Summary: Implemented unit tests covering norm-based clipping for two sharded DTensors. All test cases pass. Differential Revision: D79301301
1 parent fc56fb0 commit 7809b14
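
For readers unfamiliar with the DTensor plumbing the new test exercises, below is a minimal, hypothetical single-process sketch (not part of this commit) of sharding a 1-D tensor and reading its gradient back. It assumes a recent PyTorch (>= 2.5) where these APIs live under torch.distributed.tensor; the arbitrary address/port and the world_size=1 gloo group are illustration-only, whereas distribute_tensor, Shard, and full_tensor() are the same calls used in the diff, which the real test runs across multiple ranks via DTensorTestBase.

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

# Single-process (world_size=1, gloo) illustration only; address/port are arbitrary.
dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)
mesh = init_device_mesh("cpu", (1,))

# Shard a parameter and its gradient on dim 0, mirroring what the test does per rank.
param = distribute_tensor(torch.tensor([1.0, 2.0, 3.0], requires_grad=True), mesh, [Shard(0)])
param.grad = distribute_tensor(torch.tensor([12.0, 15.0, 18.0]), mesh, [Shard(0)])

# full_tensor() gathers the shards; the test uses it to compare against the plain-tensor reference.
print(param.grad.full_tensor())  # -> tensor([12., 15., 18.])

dist.destroy_process_group()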

File tree: 1 file changed, +98 −3 lines changed


torchrec/optim/tests/test_clipping.py

Lines changed: 98 additions & 3 deletions
@@ -245,19 +245,21 @@ def test_clip_no_gradients_norm_meta_device(
 @unittest.skipIf(not torch.cuda.is_available(), "Skip when CUDA is not available")
 @instantiate_parametrized_tests
 class TestGradientClippingDTensor(DTensorTestBase):
+    """No tests for Replicated DTensors, as they are handled before GradientClippingOptimizer."""
+
     def _get_params_to_pg(
         self, params: List[DTensor]
     ) -> Dict[DTensor, List[ProcessGroup]]:
         return {param: [param.device_mesh.get_group()] for param in params}
 
     @with_comms
     @parametrize("norm_type", ("inf", 1, 2))
-    def test_dtensor_clip_all_gradients_norm(
+    def test_tensor_and_sharded_dtensor_clip_all_gradients_norm(
         self, norm_type: Union[float, str]
     ) -> None:
         """
         Test to ensure that the gradient clipping optimizer clips gradients
-        correctly with mixed DTensor and tensor by comparing gradients to its
+        correctly with mixed sharded DTensor and tensor by comparing gradients to their
         torch.tensor counterpart.
 
         Note that clipping for DTensor may require communication.
@@ -286,7 +288,7 @@ def test_dtensor_clip_all_gradients_norm(
         ref_param_2.grad = torch.tensor([20.0, 30.0, 15.0], device=self.device_type)
         ref_gradient_clipping_optimizer.step()
 
-        # create gradient clipping optimizer containing both DTensor and tensor
+        # create gradient clipping optimizer containing sharded DTensor and tensor
         device_mesh = init_device_mesh(self.device_type, (self.world_size,))
         param_1 = distribute_tensor(
             torch.tensor([1.0, 2.0, 3.0], requires_grad=True, device=self.device_type),
@@ -336,3 +338,96 @@ def test_dtensor_clip_all_gradients_norm(
                     ref_param.grad,
                     f"Expect gradient to be the same. However, found {param_grad=}, {ref_param.grad=}",
                 )
+
+    @with_comms
+    @parametrize("norm_type", ("inf", 1, 2))
+    def test_multiple_sharded_dtensors_clip_all_gradients_norm(
+        self, norm_type: Union[float, str]
+    ) -> None:
+        """
+        Test to ensure that the gradient clipping optimizer clips gradients
+        correctly with multiple sharded DTensors by comparing gradients to their
+        torch.tensor counterparts.
+
+        Note that clipping for DTensor may require communication.
+        """
+
+        # create gradient clipping optimizer containing no DTensor for reference
+        ref_param_1 = torch.nn.Parameter(
+            torch.tensor([1.0, 2.0, 3.0], device=self.device_type)
+        )
+        ref_param_2 = torch.nn.Parameter(
+            torch.tensor([4.0, 5.0, 6.0], device=self.device_type)
+        )
+        ref_keyed_optimizer = DummyKeyedOptimizer(
+            {"param_1": ref_param_1, "param_2": ref_param_2},
+            {},
+            [{"params": [ref_param_1, ref_param_2]}],
+        )
+        ref_gradient_clipping_optimizer = GradientClippingOptimizer(
+            optimizer=ref_keyed_optimizer,
+            clipping=GradientClipping.NORM,
+            max_gradient=10.0,
+            norm_type=norm_type,
+        )
+        ref_gradient_clipping_optimizer.zero_grad()
+        ref_param_1.grad = torch.tensor([12.0, 15.0, 18.0], device=self.device_type)
+        ref_param_2.grad = torch.tensor([20.0, 30.0, 15.0], device=self.device_type)
+        ref_gradient_clipping_optimizer.step()
+
+        # create gradient clipping optimizer containing two sharded DTensors
+        device_mesh = init_device_mesh(self.device_type, (self.world_size,))
+        param_1 = distribute_tensor(
+            torch.tensor([1.0, 2.0, 3.0], requires_grad=True, device=self.device_type),
+            device_mesh,
+            [Shard(0)],
+        )
+        param_2 = distribute_tensor(
+            torch.tensor([4.0, 5.0, 6.0], requires_grad=True, device=self.device_type),
+            device_mesh,
+            [Shard(0)],
+        )
+        param_to_pgs = self._get_params_to_pg([param_1, param_2])
+        keyed_optimizer = DummyKeyedOptimizer(
+            {"dtensor_param_1": param_1, "dtensor_param_2": param_2},
+            {},
+            [{"params": [param_1, param_2]}],
+        )
+        gradient_clipping_optimizer = GradientClippingOptimizer(
+            optimizer=keyed_optimizer,
+            clipping=GradientClipping.NORM,
+            max_gradient=10.0,
+            norm_type=norm_type,
+            enable_global_grad_clip=True,
+            param_to_pgs=param_to_pgs,  # pyre-ignore[6]
+        )
+        gradient_clipping_optimizer.zero_grad()
+        param_1.grad = distribute_tensor(
+            torch.tensor([12.0, 15.0, 18.0], device=self.device_type),
+            device_mesh,
+            [Shard(0)],
+        )
+        param_2.grad = distribute_tensor(
+            torch.tensor([20.0, 30.0, 15.0], device=self.device_type),
+            device_mesh,
+            [Shard(0)],
+        )
+        gradient_clipping_optimizer.step()
+
+        for param_group, ref_param_group in zip(
+            gradient_clipping_optimizer.param_groups,
+            ref_gradient_clipping_optimizer.param_groups,
+        ):
+            for param, ref_param in zip(
+                param_group["params"], ref_param_group["params"]
+            ):
+                param_grad = (
+                    param.grad.full_tensor()  # pyre-ignore[16]
+                    if isinstance(param, DTensor)
+                    else param.grad
+                )
+                self.assertEqual(
+                    param_grad,
+                    ref_param.grad,
+                    f"Expect gradient to be the same. However, found {param_grad=}, {ref_param.grad=}",
+                )
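
Both tests rely on the same property: per-shard DTensor gradients, combined across the process group via enable_global_grad_clip and param_to_pgs, must end up scaled exactly as a single-process run over the full tensors would scale them. As a minimal, hypothetical sketch (not part of the commit), and assuming torchrec's GradientClipping.NORM reduces to standard global-norm clipping, the reference gradients used above can be reproduced with plain torch.nn.utils.clip_grad_norm_:

import torch

# Reference gradients and max_gradient taken from the test above.
for norm_type in (float("inf"), 1.0, 2.0):
    p1 = torch.nn.Parameter(torch.tensor([1.0, 2.0, 3.0]))
    p2 = torch.nn.Parameter(torch.tensor([4.0, 5.0, 6.0]))
    p1.grad = torch.tensor([12.0, 15.0, 18.0])
    p2.grad = torch.tensor([20.0, 30.0, 15.0])
    # Global norm over both grads; grads are rescaled in place by
    # max_norm / total_norm whenever total_norm exceeds max_norm.
    total_norm = torch.nn.utils.clip_grad_norm_(
        [p1, p2], max_norm=10.0, norm_type=norm_type
    )
    print(norm_type, total_norm.item(), p1.grad.tolist(), p2.grad.tolist())

For example, with norm_type=inf the global norm is 30.0, so every gradient component is scaled by roughly 1/3. A typical local invocation of only the new test (not prescribed by the commit) would be: python -m pytest torchrec/optim/tests/test_clipping.py -k multiple_sharded_dtensors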

0 commit comments
