Commit 4660e38

yushangdi authored and pytorchmergebot committed
write conv1d decomposition (pytorch#163080)
In Unified Runtime we cannot have any fallback ops (for now), and not all conv1d ops can avoid fallbacks today, so we write a decomposition for conv1d. It is not registered in the default decomposition table, since currently only ExecuTorch/Unified Runtime needs it. It might benefit Inductor as well, because conv2d can generate Triton kernels while there is no Triton codegen for conv1d; whether the conv2d Triton kernel outperforms aten::conv1d is unknown, so it is not registered by default yet. To register it, one just needs to do `import torch._decomp as decomp; decomp.register_decomposition(torch.ops.aten.conv1d.default, conv1d_to_conv2d)`.

Pull Request resolved: pytorch#163080
Approved by: https://github.com/angelayi
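A minimal opt-in sketch (an assumption, not part of the commit: it imports conv1d_to_conv2d from torch._inductor.decomposition as the test does, and applies register_decomposition in its decorator-factory form):

import torch
import torch._decomp as decomp
from torch._inductor.decomposition import conv1d_to_conv2d

# Opt-in: route aten.conv1d through the conv2d-based decomposition.
# register_decomposition(op) is used here as a decorator factory; the exact
# call form is an assumption about the torch._decomp API.
decomp.register_decomposition(torch.ops.aten.conv1d.default)(conv1d_to_conv2d)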
1 parent 5236007 commit 4660e38

2 files changed: +91 −0 lines

test/test_decomp.py

Lines changed: 49 additions & 0 deletions
@@ -1343,6 +1343,55 @@ def test_aten_core_operators(self):
        core_aten_ops = useful_decomps - core_decomps
        self.assertExpected("".join(sorted(op.name() + "\n" for op in core_aten_ops)))

    def test_conv1d_decomposition(self):
        from torch._inductor.decomposition import conv1d_to_conv2d

        def check_case(
            N=2,
            C_in=3,
            C_out=5,
            L=37,
            K=5,
            stride=2,
            padding=3,
            dilation=1,
            groups=1,
            dtype=torch.float32,
            device="cpu",
        ):
            torch.manual_seed(0)
            x = torch.randn(N, C_in, L, dtype=dtype, device=device)
            w = torch.randn(C_out, C_in // groups, K, dtype=dtype, device=device)
            b = torch.randn(C_out, dtype=dtype, device=device)

            ref = torch.ops.aten.conv1d.default(
                x,
                w,
                b,
                stride=[stride],
                padding=[padding],
                dilation=[dilation],
                groups=groups,
            )
            got = conv1d_to_conv2d(
                x,
                w,
                b,
                stride=[stride],
                padding=[padding],
                dilation=[dilation],
                groups=groups,
            )
            self.assertTrue(torch.allclose(ref, got, atol=1e-5, rtol=1e-5))

        # A few cases
        check_case()  # default
        check_case(stride=1, padding=0, K=3)
        check_case(stride=3, padding=4, K=7)
        check_case(dilation=2, padding=6, K=5)  # dilation
        check_case(groups=1, C_in=8, C_out=12)  # groups=1, bigger
        check_case(groups=2, C_in=8, C_out=12)  # grouped conv


if __name__ == "__main__":
    run_tests()

torch/_inductor/decomposition.py

Lines changed: 42 additions & 0 deletions
@@ -1172,3 +1172,45 @@ def repeat_interleave_Tensor(
    return torch.searchsorted(
        cumsum, pos, out_int32=(repeat.dtype == torch.int32), right=True
    )


# intentionally not registered
def conv1d_to_conv2d(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor] = None,
    stride: tuple[int] = (1,),
    padding: tuple[int] = (0,),
    dilation: tuple[int] = (1,),
    groups: int = 1,
) -> torch.Tensor:
    # Shapes:
    #   input:  (N, C_in, L_in)
    #   weight: (C_out, C_in // groups, K)
    #   bias:   (C_out,)
    assert input.dim() == 3 and weight.dim() == 3, (
        "Expect (N,C_in,L) and (C_out,C_in//groups,K)"
    )

    stride = stride[0]
    padding = padding[0]
    dilation = dilation[0]

    # Unsqueeze to make input 2D: (N,C,L) -> (N,C,L,1)
    input_2d = input.unsqueeze(-1)
    # Unsqueeze kernel: (C_out,C_in/groups,K) -> (C_out,C_in/groups,K,1)
    weight_2d = weight.unsqueeze(-1)

    # Call conv2d with adjusted args
    out_2d = aten.conv2d.default(
        input_2d,
        weight_2d,
        bias,
        stride=(stride, 1),
        padding=(padding, 0),
        dilation=(dilation, 1),
        groups=groups,
    )

    # Squeeze dummy dimension back out: (N,C_out,L_out,1) -> (N,C_out,L_out)
    return out_2d.squeeze(-1)
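For intuition, a small hypothetical sanity check of the shape math (not part of the PR): with the defaults used in the test above, the standard convolution output-length formula shows why the appended dummy width dimension passes through conv2d unchanged.

# Standard conv output-length formula, applied to the test defaults
# (L_in=37, K=5, stride=2, padding=3, dilation=1):
L_in, K, stride, padding, dilation = 37, 5, 2, 3, 1
L_out = (L_in + 2 * padding - dilation * (K - 1) - 1) // stride + 1  # -> 20
# The appended width is 1, and conv2d sees kernel 1, stride 1, padding 0 there,
# so that axis stays at 1 and squeeze(-1) restores the conv1d output shape:
W_out = (1 + 2 * 0 - 1 * (1 - 1) - 1) // 1 + 1  # -> 1
print(L_out, W_out)  # 20 1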
