Commit ede6186

H-Huang authored and pytorchmergebot committed
[PP] Allow intermediate nodes in ZB to have multiple grads (pytorch#159084)
Fixes a ZB regression (https://github.com/pytorch/torchtitan/actions/runs/16478292562/job/46585646792).

Previously we only allowed an intermediate node to have a single gradient. Recently a torchtitan ZB test started failing, and I tracked it back to the FusedRMSNorm grad_fn producing two values `(grad, None)` (see pytorch#153666), which broke our ZB tests. This PR allows `stage_backward_weight` intermediate nodes to have multiple grads: the non-None grads are summed together, and None grad values are ignored.

Here is an example where the backward would have two grad values (gI1, gI2):

```python
class Func(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        return x, 2

    @staticmethod
    def backward(ctx, gI1, gI2):
        assert gI2 is None
        return gI1
```

Pull Request resolved: pytorch#159084
Approved by: https://github.com/tianyu-l
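For illustration, below is a minimal, self-contained sketch of the summing behavior described above. This is not the code in `_backward.py`; the helper `sum_non_none_grads` and its inputs are made up for the example. Per intermediate, the non-None grads are summed, and an intermediate whose grads are all None is skipped entirely.

```python
import torch

# Hypothetical helper mirroring the described behavior: sum the non-None
# grads for each intermediate and drop intermediates with no real grads.
def sum_non_none_grads(grads_per_intermediate):
    kept_indices, grad_outputs = [], []
    for idx, grads_tuple in enumerate(grads_per_intermediate):
        non_none = [g for g in grads_tuple if g is not None]
        if non_none:  # skip intermediates whose grads are all None
            kept_indices.append(idx)
            grad_outputs.append(sum(non_none))
    return kept_indices, grad_outputs

# Mirrors the FusedRMSNorm case: one real grad plus a None.
g = torch.ones(4, 8)
kept, outs = sum_non_none_grads([(g, None), (None,)])
assert kept == [0]
assert torch.equal(outs[0], g)
```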
1 parent 6d071bd commit ede6186

File tree

2 files changed: +67 −23 lines


test/distributed/pipelining/test_backward.py

Lines changed: 38 additions & 0 deletions
```diff
@@ -183,6 +183,44 @@ def test_stage_backward_weight_multiple_iters(self, device):
                 print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}")
                 raise
 
+    def test_stage_backward_weight_grad_validation(self, device):
+        test_cases = [
+            (
+                "size >= 2",
+                lambda: [
+                    (
+                        torch.randn(batch_size, d_hid, device=device),
+                        torch.randn(batch_size, d_hid, device=device),
+                    )
+                ],
+            ),
+            ("size = 1", lambda: [(torch.randn(batch_size, d_hid, device=device),)]),
+            (
+                "1 grad, 1 None",
+                lambda: [(torch.randn(batch_size, d_hid, device=device), None)],
+            ),
+        ]
+
+        for description, mock_grads_factory in test_cases:
+            with self.subTest(description=description):
+                mod = MLPModule(d_hid).to(device)
+                x = torch.randn(batch_size, d_hid, device=device)
+                x.requires_grad_(True)
+                out = mod(x)
+                loss = torch.sum(out)
+                dinputs, param_groups = stage_backward_input(
+                    stage_outputs_or_loss=[loss],
+                    output_grads=None,
+                    input_values=[x],
+                    weights=mod.parameters(),
+                )
+
+                # Set up mock grads
+                for param_group in param_groups:
+                    param_group["grads"] = mock_grads_factory()
+
+                stage_backward_weight(mod.parameters(), param_groups)
+
 
 devices = ["cpu", "cuda", "hpu", "xpu"]
 instantiate_device_type_tests(StageBackwardTests, globals(), only_for=devices)
```

torch/distributed/pipelining/_backward.py

Lines changed: 29 additions & 23 deletions
```diff
@@ -235,11 +235,17 @@ def stage_backward_weight(
         weight_grads.append(weight.grad)
 
     for param_group in param_groups:
-        # TODO: Handle case where intermediate can have multiple outputs
-        intermediate_edges = tuple(
-            GradientEdge(i, 0) for i in param_group["intermediates"]
-        )
-        weights_edges = tuple(GradientEdge(w, 0) for w in param_group["params"])
+        valid_edges = []
+        valid_grad_outputs: list[torch.Tensor] = []
+
+        for grads_tuple, intermediate in zip(
+            param_group["grads"], param_group["intermediates"]
+        ):
+            non_none_grads = [g for g in grads_tuple if g is not None]
+            if non_none_grads:
+                summed_grad = sum(non_none_grads)
+                valid_edges.append(GradientEdge(intermediate, 0))
+                valid_grad_outputs.append(summed_grad)
 
         # Break a reference cycle caused inside stage_backward_input->get_hook->hook
         # The summarized cycle is:
@@ -248,25 +254,25 @@
         # We need to keep intermediates alive up until backward_weight, but we can free it now.
         del param_group["intermediates"]
 
-        assert all(len(g) == 1 for g in param_group["grads"])
-        # [NEW!] Able to pass a GradientEdge to autograd.grad as output
-        # We do not need to retain_graph because... guarantee no overlap?
-        # print("trying to execute: ", intermediate_edges, weights_edges)
-        dweights = torch.autograd.grad(
-            intermediate_edges,
-            weights_edges,
-            grad_outputs=sum(param_group["grads"], tuple()),
-            retain_graph=retain_graph,
-        )
-        # release grad memory early after use
-        del param_group["grads"]
+        if valid_edges:  # Only call autograd.grad if we have valid gradients
+            # [NEW!] Able to pass a GradientEdge to autograd.grad as output
+            weights_edges = tuple(GradientEdge(w, 0) for w in param_group["params"])
+            dweights = torch.autograd.grad(
+                valid_edges,
+                weights_edges,
+                grad_outputs=valid_grad_outputs,
+                retain_graph=retain_graph,
+            )
 
-        for grad_acc, dw in zip(param_group["params"], dweights):
-            weight, index = grad_acc_to_weight[grad_acc]
-            if weight.grad is None:
-                weight.grad = dw
-            else:
-                weight.grad += dw
+            # release grad memory early after use
+            del param_group["grads"]
+
+            for grad_acc, dw in zip(param_group["params"], dweights):
+                weight, index = grad_acc_to_weight[grad_acc]
+                if weight.grad is None:
+                    weight.grad = dw
+                else:
+                    weight.grad += dw
     # return grads in the original order weights were provided in
     return tuple(weight_grads)
```
