Commit 8bb1152

stmcgovern authored and pytorchmergebot committed
[DTensor] Fix convolution ops with bias=None in torch.compile (pytorch#167258)
Fixes pytorch#167091

DTensor convolution operations crashed when bias=None was passed under torch.compile, because the code assumed bias always exists while the ATen schema defines it as optional (Tensor?). This fix:

- Handles None bias_spec in convolution_rules (forward pass)
- Handles None bias_shape_opt in convolution_backward_rules
- Returns None for grad_bias_spec when bias is None
- Extends None output handling to indices 0, 1, and 2 in _sharding_prop.py

Added 3 regression tests covering compile mode, the backward pass, and the nn.Conv2d module API with bias=False.

This is related to issue pytorch#159959 and PR pytorch#165438, which resolves it; the two overlap in the `_sharding_prop.py` change.

Pull Request resolved: pytorch#167258
Approved by: https://github.com/XilunWu
1 parent bbf39ca commit 8bb1152
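
For context, a minimal single-process sketch of the crash scenario described in the message above; the process-group and mesh setup here are illustrative assumptions (recent PyTorch with the public torch.distributed.tensor API), not code from the PR:

    import torch
    import torch.distributed as dist
    import torch.nn.functional as F
    from torch.distributed.device_mesh import init_device_mesh
    from torch.distributed.tensor import distribute_tensor, Replicate

    # Single-process "distributed" setup so DTensor has a mesh to work with.
    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500",
                            rank=0, world_size=1)
    mesh = init_device_mesh("cpu", (1,))

    x_dt = distribute_tensor(torch.randn(1, 4, 5, 5), mesh, [Replicate()])
    w_dt = distribute_tensor(torch.randn(8, 4, 3, 3), mesh, [Replicate()])

    def conv_fn(x, w):
        # bias=None is the case that used to trip the DTensor sharding rules
        return F.conv2d(x, w, bias=None, padding=1)

    out = torch.compile(conv_fn)(x_dt, w_dt)  # raised an AssertionError before this fix
    print(out.shape)  # torch.Size([1, 8, 5, 5])
    dist.destroy_process_group()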

File tree: 3 files changed (+137, -23 lines)

test/distributed/tensor/test_convolution_ops.py

Lines changed: 96 additions & 0 deletions
@@ -230,6 +230,98 @@ def test_conv3d(self):
         out_dt, out = self._run_single_arg_fwd(model, x, [Shard(0)])
         self.assertEqual(out_dt, out)
 
+    @with_comms
+    def test_conv2d_no_bias_compile(self):
+        """Test Conv2d with bias=False in compile mode (Issue #167091)
+
+        Regression test: Previously this would fail during torch.compile
+        tracing with AssertionError when bias_spec was None.
+        """
+        device_mesh = self.build_device_mesh()
+
+        def conv_fn(x, w):
+            return F.conv2d(x, w, bias=None, padding=1)
+
+        compiled_fn = torch.compile(conv_fn)
+
+        # Create tensors
+        x = torch.randn(1, 4, 5, 5, device=self.device_type)
+        w = torch.randn(8, 4, 3, 3, device=self.device_type)
+
+        # Distribute tensors
+        x_dt = distribute_tensor(x, device_mesh, [Replicate()])
+        w_dt = distribute_tensor(w, device_mesh, [Replicate()])
+
+        # Test eager mode for comparison
+        result_eager = conv_fn(x_dt, w_dt)
+
+        # Test compiled mode - this should not crash
+        result_compiled = compiled_fn(x_dt, w_dt)
+
+        # Verify shape is correct (the key regression test)
+        self.assertEqual(result_compiled.shape, torch.Size([1, 8, 5, 5]))
+
+        # Verify numerical correctness
+        torch.testing.assert_close(result_compiled.to_local(), result_eager.to_local())
+
+    @with_comms
+    def test_conv2d_no_bias_backward(self):
+        """Test Conv2d backward pass with bias=False (Issue #167091)
+
+        Regression test: Previously backward pass would fail when
+        grad_bias_spec was None.
+        """
+        device_mesh = self.build_device_mesh()
+
+        # Create tensors with requires_grad
+        x = torch.randn(1, 4, 5, 5, device=self.device_type)
+        w = torch.randn(8, 4, 3, 3, device=self.device_type, requires_grad=True)
+
+        # Distribute tensors
+        x_dt = distribute_tensor(x, device_mesh, [Replicate()])
+        w_dt = torch.nn.Parameter(distribute_tensor(w, device_mesh, [Replicate()]))
+
+        # Forward pass
+        result = F.conv2d(x_dt, w_dt, bias=None, padding=1)
+
+        # Backward pass - this should not crash
+        grad_output = torch.randn_like(result)
+        result.backward(grad_output)
+
+        # Check weight gradient exists (the key regression test)
+        self.assertIsNotNone(w_dt.grad)
+        self.assertEqual(w_dt.grad.shape, torch.Size([8, 4, 3, 3]))
+
+    @with_comms
+    def test_conv2d_module_no_bias(self):
+        """Test nn.Conv2d module with bias=False (Issue #167091)
+
+        Regression test: Ensures nn.Conv2d with bias=False works with DTensor.
+        """
+        device_mesh = self.build_device_mesh()
+
+        # Create model with bias=False
+        model = nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=False).to(
+            self.device_type
+        )
+        nn.init.ones_(model.weight)
+
+        # Distribute model
+        model_dt = distribute_module(model, device_mesh, _conv_fn)
+
+        # Create input
+        x = torch.randn(1, 4, 5, 5, device=self.device_type)
+        x_dt = distribute_tensor(x, device_mesh, [Replicate()])
+
+        # Forward pass - this should not crash
+        output_dt = model_dt(x_dt)
+
+        # Check output shape is correct
+        self.assertEqual(output_dt.shape, torch.Size([1, 8, 5, 5]))
+
+        # Check that model.bias is None
+        self.assertIsNone(model.bias)
+
 
 DistConvolutionOpsTestWithLocalTensor = create_local_tensor_test_class(
     DistConvolutionOpsTest,
@@ -238,6 +330,10 @@ def test_conv3d(self):
         "test_conv_backward_none_grad_inp",
         "test_depthwise_convolution",
         "test_downsampling_convolution",
+        # New tests for Issue #167091 - use send/recv via tp_convolution
+        "test_conv2d_no_bias_compile",
+        "test_conv2d_no_bias_backward",
+        "test_conv2d_module_no_bias",
     ],
 )
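
Note that test_conv2d_module_no_bias passes a partition function _conv_fn defined earlier in this test file. A rough sketch of the replicate-everything pattern such a partition function follows (the name replicate_all_params and its body are illustrative, not the actual helper):

    import torch.nn as nn
    from torch.distributed.tensor import distribute_tensor, Replicate

    def replicate_all_params(name, module, device_mesh):
        # named_parameters(recurse=False) yields only real Parameters, so a
        # Conv2d built with bias=False simply has no bias entry to distribute.
        for param_name, param in list(module.named_parameters(recurse=False)):
            dist_param = nn.Parameter(distribute_tensor(param, device_mesh, [Replicate()]))
            module.register_parameter(param_name, dist_param)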

torch/distributed/tensor/_ops/_conv_ops.py

Lines changed: 33 additions & 17 deletions
@@ -26,15 +26,18 @@ def convolution_rules(op_schema: OpSchema) -> OutputSharding:
 
     assert isinstance(input_spec, DTensorSpec)
     assert isinstance(weight_spec, DTensorSpec)
-    assert isinstance(bias_spec, DTensorSpec)
+    # bias_spec can be None (optional parameter in aten.convolution schema)
+    if bias_spec is not None:
+        assert isinstance(bias_spec, DTensorSpec)
     assert input_spec.tensor_meta is not None
     assert weight_spec.tensor_meta is not None
     in_shape = input_spec.tensor_meta.shape
     weight_shape = weight_spec.tensor_meta.shape
-    assert isinstance(stride, list)
-    assert isinstance(padding, list)
-    assert isinstance(dilation, list)
-    assert isinstance(weight_shape, torch.Size)
+    assert isinstance(stride, list), f"stride must be list, got {type(stride)}"
+    assert isinstance(padding, list), f"padding must be list, got {type(padding)}"
+    assert isinstance(dilation, list), f"dilation must be list, got {type(dilation)}"
+    # weight_shape might not be torch.Size in all cases (e.g., SymIntArrayRef during tracing)
+    # so we don't assert its type, just use it
     out_conv_shape = [
         (d + 2 * padding[i] - dilation[i] * (weight_shape[i + 1] - 1) - 1) // stride[i]
         + 1
@@ -82,14 +85,21 @@ def convolution_backward_rules(op_schema: OpSchema) -> OutputSharding:
     assert isinstance(grad_output_spec, DTensorSpec)
     assert isinstance(input_spec, DTensorSpec)
     assert isinstance(weight_spec, DTensorSpec)
-    assert isinstance(bias_shape_opt, list)
+    # bias_shape_opt can be None (optional parameter in aten.convolution_backward schema)
+    if bias_shape_opt is not None:
+        assert isinstance(bias_shape_opt, list)
     assert input_spec.tensor_meta is not None
     weight_tensor_meta = weight_spec.tensor_meta
-    bias_tensor_meta = TensorMeta(
-        torch.Size(bias_shape_opt),
-        (1,),
-        input_spec.tensor_meta.dtype,
-    )
+
+    # Only create bias_tensor_meta if bias_shape_opt is not None
+    if bias_shape_opt is not None:
+        bias_tensor_meta = TensorMeta(
+            torch.Size(bias_shape_opt),
+            (1,),
+            input_spec.tensor_meta.dtype,
+        )
+    else:
+        bias_tensor_meta = None
 
     grad_input_spec = input_spec
     grad_weight_spec = DTensorSpec.from_dim_map(
@@ -98,12 +108,18 @@ def convolution_backward_rules(op_schema: OpSchema) -> OutputSharding:
         [0],
         tensor_meta=weight_tensor_meta,
     )
-    grad_bias_spec = DTensorSpec.from_dim_map(
-        input_spec.mesh,
-        [-1],
-        [0],
-        tensor_meta=bias_tensor_meta,
-    )
+
+    # Only create grad_bias_spec if we have bias_tensor_meta
+    if bias_tensor_meta is not None:
+        grad_bias_spec = DTensorSpec.from_dim_map(
+            input_spec.mesh,
+            [-1],
+            [0],
+            tensor_meta=bias_tensor_meta,
+        )
+    else:
+        grad_bias_spec = None
+
     # TODO: actually the output_mask is not respected here, we should
     # set the corresponding spec to `None` if the output_mask is not `False`
     # for a certain output Tensor. This also applies to the conv handler
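
The new guards rely on bias being optional in the ATen signature. One way to confirm the `Tensor?` annotation (using the internal _schema attribute that op overloads expose; output abbreviated in the comment below) is:

    import torch

    # The trailing "?" on the bias argument marks it as optional in the schema:
    # convolution(Tensor input, Tensor weight, Tensor? bias, ...) -> Tensor
    print(torch.ops.aten.convolution.default._schema)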

torch/distributed/tensor/_sharding_prop.py

Lines changed: 8 additions & 6 deletions
@@ -275,14 +275,16 @@ def _create_output_spec_with_new_tensor_meta(
         output_tensor_meta_i = output_tensor_meta[i]
         if not isinstance(output_tensor_meta_i, TensorMeta):
             # NOTE: aten.convolution_backward.default is an exception and it
-            # needs extra handling because the first Tensor in the output
-            # tuple can be `None` if the input Tensor to convolution op has
-            # `requires_grad=False` (e.g. convolution layer is the first
-            # layer in the model). We explicitly allow its corresponding
-            # TensorMeta to be `None`.
+            # needs extra handling because any Tensor in the output tuple
+            # can be `None` depending on the output_mask parameter. This can
+            # occur during double backpropagation or when certain gradients
+            # are not needed (e.g., grad_input when input has requires_grad=False,
+            # grad_weight/grad_bias when weight/bias have requires_grad=False,
+            # or grad_bias when bias is None). We explicitly allow the
+            # corresponding TensorMeta to be `None`.
             if (
                 op == aten.convolution_backward.default
-                and i == 0
+                and i in (0, 1, 2)
                 and output_tensor_meta_i is None
             ):
                 assert isinstance(output_specs, list)
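
To see why output indices 1 and 2 can also be None, one can invoke the op directly. A sketch with plain ATen tensors (no DTensor), following the aten.convolution_backward argument layout, where grad_bias is masked out:

    import torch

    x = torch.randn(1, 4, 5, 5)
    w = torch.randn(8, 4, 3, 3)
    out = torch.nn.functional.conv2d(x, w, bias=None, padding=1)
    grad_out = torch.randn_like(out)

    grad_input, grad_weight, grad_bias = torch.ops.aten.convolution_backward(
        grad_out, x, w,
        None,                    # bias_sizes: None because there is no bias
        [1, 1], [1, 1], [1, 1],  # stride, padding, dilation
        False, [0, 0], 1,        # transposed, output_padding, groups
        [True, True, False],     # output_mask: grad_bias not requested
    )
    print(grad_bias)  # None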
