Commit f4bf7cb

Add backward pass support to addmm and gemm operators

Differential Revision: D84263978
Pull Request resolved: #531
1 parent: b03df33

6 files changed: +171 -20
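
Every file below follows the same pattern: the Triton matmul entry point is wrapped in a torch.autograd.Function whose backward reuses the same kernel on transposed operands, since for C = A @ B the gradients are dA = dC @ B^T and dB = A^T @ dC. A minimal sketch of that pattern, with torch.matmul standing in for the Triton kernel (an illustration, not the tritonbench code):

import torch


class _SketchMatmul(torch.autograd.Function):
    @staticmethod
    def forward(ctx, a, b):
        ctx.save_for_backward(a, b)  # keep the inputs for the backward pass
        return torch.matmul(a, b)

    @staticmethod
    def backward(ctx, grad_output):
        a, b = ctx.saved_tensors
        grad_a = grad_b = None
        if ctx.needs_input_grad[0]:
            grad_a = torch.matmul(grad_output, b.t())  # dA = dC @ B^T
        if ctx.needs_input_grad[1]:
            grad_b = torch.matmul(a.t(), grad_output)  # dB = A^T @ dC
        return grad_a, grad_b


# Cross-check against PyTorch's own autograd (float64 keeps the comparison tight).
a = torch.randn(64, 32, dtype=torch.float64, requires_grad=True)
b = torch.randn(32, 16, dtype=torch.float64, requires_grad=True)
_SketchMatmul.apply(a, b).sum().backward()
ga, gb = a.grad.clone(), b.grad.clone()
a.grad = b.grad = None
torch.matmul(a, b).sum().backward()
assert torch.allclose(ga, a.grad) and torch.allclose(gb, b.grad)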

tritonbench/operators/addmm/operator.py

Lines changed: 8 additions & 7 deletions

@@ -20,6 +20,7 @@
 from tritonbench.utils.triton_op import (
     BenchmarkOperator,
     BenchmarkOperatorMetrics,
+    Mode,
     PRECISION_DTYPE_MAPPING,
     register_benchmark,
     register_metric,
@@ -81,7 +82,6 @@
 class Operator(BenchmarkOperator):
     DEFAULT_METRICS = ["tflops", "best_config"]
     DEFAULT_PRECISION = "fp16"
-    FWD_ONLY = True

     def __init__(
         self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
@@ -171,6 +171,7 @@ def get_input_iter(self) -> Generator:
         if hasattr(self, "dtypes") and self.dtypes:
             self.tb_args.precision = "bypass"
             self.dtype = PRECISION_DTYPE_MAPPING[self.dtypes[shape_id]]
+        requires_grad = self.mode in (Mode.BWD, Mode.FWD_BWD)
         if hasattr(self, "strides"):
             # generate shapes with different strides
             strides = self.strides[shape_id]
@@ -188,13 +189,13 @@ def get_input_iter(self) -> Generator:
             original_n = max(n, strides[2][0])
             a = torch.randn(
                 (m, n), device=self.device, dtype=self.dtype
-            ).requires_grad_(False)
+            ).requires_grad_(requires_grad)
             mat1 = torch.randn(
                 (original_m, original_k), device=self.device, dtype=self.dtype
-            ).requires_grad_(False)
+            ).requires_grad_(requires_grad)
             mat2 = torch.randn(
                 (original_k, original_n), device=self.device, dtype=self.dtype
-            ).requires_grad_(False)
+            ).requires_grad_(requires_grad)
             a = a.as_strided((m, n), strides[0])
             mat1 = mat1.as_strided((m, k), strides[1])
             mat2 = mat2.as_strided((k, n), strides[2])
@@ -203,13 +204,13 @@ def get_input_iter(self) -> Generator:
             m, k, n = shape
             a = torch.randn(
                 (m, n), device=self.device, dtype=self.dtype
-            ).requires_grad_(False)
+            ).requires_grad_(requires_grad)
             mat1 = torch.randn(
                 (m, k), device=self.device, dtype=self.dtype
-            ).requires_grad_(False)
+            ).requires_grad_(requires_grad)
             mat2 = torch.randn(
                 (k, n), device=self.device, dtype=self.dtype
-            ).requires_grad_(False)
+            ).requires_grad_(requires_grad)
         if self.col_major:
             mat2 = mat2.T.contiguous().T
         yield a, mat1, mat2
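
With FWD_ONLY gone, the input iterator now sets requires_grad on the operands whenever the benchmark runs in BWD or FWD_BWD mode, so the backward pass can actually be timed. A quick sketch of what that enables, using torch.addmm as a stand-in for the benchmarked kernels (sizes and the float64 dtype are illustrative, chosen to keep the numerical check tight):

import torch

m, k, n = 128, 64, 32
a = torch.randn(m, n, dtype=torch.float64, requires_grad=True)     # the "bias" matrix
mat1 = torch.randn(m, k, dtype=torch.float64, requires_grad=True)
mat2 = torch.randn(k, n, dtype=torch.float64, requires_grad=True)

out = torch.addmm(a, mat1, mat2)   # out = a + mat1 @ mat2
out.sum().backward()               # grad_output is all ones

# The gradients follow the usual matmul rules:
#   d(a)    = grad_output
#   d(mat1) = grad_output @ mat2.T
#   d(mat2) = mat1.T @ grad_output
go = torch.ones_like(out)
assert torch.allclose(a.grad, go)
assert torch.allclose(mat1.grad, go @ mat2.t())
assert torch.allclose(mat2.grad, mat1.t() @ go)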

tritonbench/operators/gemm/kernels/matmul.py

Lines changed: 34 additions & 0 deletions

@@ -414,6 +414,13 @@ def forward(
         fp8_fast_accum=True,
         output_dtype=None,
     ):
+        # Save tensors for backward
+        ctx.save_for_backward(a, b)
+        ctx.acc_dtype = acc_dtype
+        ctx.input_precision = input_precision
+        ctx.fp8_fast_accum = fp8_fast_accum
+        ctx.output_dtype = output_dtype
+
         return _matmul._call(
             a,
             b,
@@ -423,5 +430,32 @@ def forward(
             output_dtype=output_dtype,
         )

+    @staticmethod
+    def backward(ctx, grad_output):
+        a, b = ctx.saved_tensors
+        grad_a = grad_b = None
+
+        if ctx.needs_input_grad[0]:
+            grad_a = _matmul._call(
+                grad_output,
+                b.t(),
+                acc_dtype=ctx.acc_dtype,
+                input_precision=ctx.input_precision,
+                fp8_fast_accum=ctx.fp8_fast_accum,
+                output_dtype=None,
+            )
+
+        if ctx.needs_input_grad[1]:
+            grad_b = _matmul._call(
+                a.t(),
+                grad_output,
+                acc_dtype=ctx.acc_dtype,
+                input_precision=ctx.input_precision,
+                fp8_fast_accum=ctx.fp8_fast_accum,
+                output_dtype=None,
+            )
+
+        return grad_a, grad_b, None, None, None, None
+

 matmul = _matmul.apply
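
One detail worth calling out: backward must return one gradient per argument of forward, so the four trailing None values line up with the non-tensor arguments acc_dtype, input_precision, fp8_fast_accum and output_dtype. A small sketch of that contract, with a scalar scale standing in for those config arguments and torch.autograd.gradcheck (in float64) confirming the tensor gradients (an illustration, not the tritonbench code):

import torch


class _ScaledMatmul(torch.autograd.Function):
    @staticmethod
    def forward(ctx, a, b, scale=1.0):
        ctx.save_for_backward(a, b)
        ctx.scale = scale
        return torch.matmul(a, b) * scale

    @staticmethod
    def backward(ctx, grad_output):
        a, b = ctx.saved_tensors
        grad_a = grad_b = None
        if ctx.needs_input_grad[0]:
            grad_a = torch.matmul(grad_output, b.t()) * ctx.scale
        if ctx.needs_input_grad[1]:
            grad_b = torch.matmul(a.t(), grad_output) * ctx.scale
        # One return value per forward argument; non-tensor args get None,
        # just like the dtype/precision flags in the diff above.
        return grad_a, grad_b, None


x = torch.randn(8, 4, dtype=torch.float64, requires_grad=True)
y = torch.randn(4, 3, dtype=torch.float64, requires_grad=True)
assert torch.autograd.gradcheck(lambda a, b: _ScaledMatmul.apply(a, b, 2.0), (x, y))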

tritonbench/operators/gemm/operator.py

Lines changed: 14 additions & 9 deletions

@@ -47,6 +47,7 @@ def _tlx_matmul(*args, **kwargs):
     BenchmarkOperator,
     BenchmarkOperatorMetrics,
     llama_shapes,
+    Mode,
     PRECISION_DTYPE_MAPPING,
     register_benchmark,
     register_metric,
@@ -176,7 +177,6 @@ def read_shapes_from_csv(csv_path: str) -> List[List[int]]:
 class Operator(BenchmarkOperator):
     DEFAULT_METRICS = ["latency", "speedup", "tflops"]
     DEFAULT_PRECISION = "fp16"
-    FWD_ONLY = True

     def __init__(
         self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
@@ -543,6 +543,7 @@ def get_input_iter(self) -> Generator:
         if hasattr(self, "dtypes") and self.dtypes:
             self.tb_args.precision = "bypass"
             self.dtype = PRECISION_DTYPE_MAPPING[self.dtypes[shape_id]]
+        requires_grad = self.mode in (Mode.BWD, Mode.FWD_BWD)
         if hasattr(self, "strides"):
             strides = self.strides[shape_id]
             assert (
@@ -558,28 +559,32 @@ def get_input_iter(self) -> Generator:
             actual_n = max(n, strides[1][0])
             a = self._scaled_randn(
                 (actual_m, actual_k), scale=k, device=self.device, dtype=self.dtype
-            )
+            ).requires_grad_(requires_grad)
             w = self._scaled_randn(
                 (actual_k, actual_n), scale=k, device=self.device, dtype=self.dtype
+            ).requires_grad_(requires_grad)
+            a = a.as_strided(size=[m, k], stride=strides[0]).requires_grad_(
+                requires_grad
+            )
+            w = w.as_strided(size=[k, n], stride=strides[1]).requires_grad_(
+                requires_grad
             )
-            a = a.as_strided(size=[m, k], stride=strides[0])
-            w = w.as_strided(size=[k, n], stride=strides[1])
         else:
             a = self._scaled_randn(
                 (m, k), scale=k, device=self.device, dtype=self.dtype
-            )
+            ).requires_grad_(requires_grad)
             w = self._scaled_randn(
                 (k, n), scale=k, device=self.device, dtype=self.dtype
-            )
+            ).requires_grad_(requires_grad)
         # Convert inputs to column-major if layout is "n" (non-transposed)
         if self.layout[0] == "n":
-            a = a.T.contiguous().T
+            a = a.T.contiguous().T.requires_grad_(requires_grad)
         if self.layout[1] == "n":
-            w = w.T.contiguous().T
+            w = w.T.contiguous().T.requires_grad_(requires_grad)
         if not bias == None:
             bias = torch.randn(
                 (bias), device=self.device, dtype=self.dtype
-            ).requires_grad_(False)
+            ).requires_grad_(requires_grad)

         yield a, w, bias
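
The a.T.contiguous().T idiom that the new requires_grad_ calls are chained onto is the file's existing layout trick: it keeps the logical shape but leaves the storage column-major. A tiny plain-PyTorch illustration (sizes arbitrary):

import torch

m, k = 4, 6
a = torch.randn(m, k)
a_col = a.T.contiguous().T

print(a.shape, a.stride())          # torch.Size([4, 6]) (6, 1)  -> row-major
print(a_col.shape, a_col.stride())  # torch.Size([4, 6]) (1, 4)  -> column-major
assert torch.equal(a, a_col)        # same values, different memory layout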

tritonbench/operators/gemm/partition_k.py

Lines changed: 31 additions & 1 deletion

@@ -220,7 +220,7 @@ def torch_reduction(c_buf, a):
 compiled_reduction = torch.compile(torch_reduction)


-def matmul_partition_k(a, b, triton_reduce=False):
+def _matmul_partition_k_impl(a, b, triton_reduce=False):
     # Check constraints.
     assert a.shape[1] == b.shape[0], "Incompatible dimensions"
     assert a.is_contiguous(), "Matrix A must be contiguous"
@@ -284,3 +284,33 @@ def matmul_partition_k(a, b, triton_reduce=False):
         return c
     else:
         return compiled_reduction(c_buf, a)
+
+
+class _PartitionKMatmul(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, a, b, triton_reduce=False):
+        # Save tensors for backward
+        ctx.save_for_backward(a, b)
+        ctx.triton_reduce = triton_reduce
+        return _matmul_partition_k_impl(a, b, triton_reduce)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        a, b = ctx.saved_tensors
+        grad_a = grad_b = None
+
+        if ctx.needs_input_grad[0]:
+            grad_a = _matmul_partition_k_impl(
+                grad_output, b.t().contiguous(), ctx.triton_reduce
+            )
+
+        if ctx.needs_input_grad[1]:
+            grad_b = _matmul_partition_k_impl(
+                a.t().contiguous(), grad_output, ctx.triton_reduce
+            )
+
+        return grad_a, grad_b, None
+
+
+def matmul_partition_k(a, b, triton_reduce=False):
+    return _PartitionKMatmul.apply(a, b, triton_reduce)
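
matmul_partition_k keeps its public signature but now routes through _PartitionKMatmul; its backward reuses the same partition-K implementation on transposed operands, made contiguous because the implementation asserts contiguous inputs. For reference, a plain-PyTorch sketch of the partition-K idea itself, with an illustrative partition count (this is not the Triton kernel):

import torch


def partition_k_matmul_sketch(a, b, partitions=4):
    m, k = a.shape
    k2, n = b.shape
    assert k == k2 and k % partitions == 0, "K must divide evenly into partitions"
    chunk = k // partitions
    # One partial product per K-partition, reduced at the end.
    c_buf = torch.empty(partitions, m, n, dtype=a.dtype, device=a.device)
    for p in range(partitions):
        ks = slice(p * chunk, (p + 1) * chunk)
        c_buf[p] = a[:, ks] @ b[ks, :]
    return c_buf.sum(dim=0)


a = torch.randn(32, 64, dtype=torch.float64)
b = torch.randn(64, 16, dtype=torch.float64)
assert torch.allclose(partition_k_matmul_sketch(a, b), a @ b)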

tritonbench/operators/gemm/stream_k.py

Lines changed: 57 additions & 2 deletions

@@ -287,7 +287,7 @@ def streamk_amd_gemm(
         start_iter = end_iter


-def streamk_amd_matmul(a, b, bias=None):
+def _streamk_amd_matmul_impl(a, b, bias=None):
     M, K = a.shape
     _, N = b.shape
     dtype = a.dtype
@@ -391,6 +391,36 @@ def streamk_amd_matmul(a, b, bias=None):
     return c


+class _StreamKAmdMatmul(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, a, b, bias=None):
+        # Save tensors for backward
+        ctx.save_for_backward(a, b, bias)
+        return _streamk_amd_matmul_impl(a, b, bias)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        a, b, bias = ctx.saved_tensors
+        grad_a = grad_b = grad_bias = None
+
+        if ctx.needs_input_grad[0]:
+            grad_a = _streamk_amd_matmul_impl(grad_output, b.t().contiguous())
+
+        if ctx.needs_input_grad[1]:
+            grad_b = _streamk_amd_matmul_impl(a.t().contiguous(), grad_output)
+
+        if ctx.needs_input_grad[2] and bias is not None:
+            grad_bias = grad_output.sum(dim=0)
+            if bias.dim() == 2:
+                grad_bias = grad_bias.unsqueeze(0)
+
+        return grad_a, grad_b, grad_bias
+
+
+def streamk_amd_matmul(a, b, bias=None):
+    return _StreamKAmdMatmul.apply(a, b, bias)
+
+
 def _matmul_launch_metadata(grid, kernel, args):
     ret = {}
     M, N, K = args["M"], args["N"], args["K"]
@@ -601,7 +631,7 @@ def streamk_cuda_gemm(
         c_desc.atomic_add([offs_am, offs_bn], c)


-def streamk_cuda_matmul(a, b):
+def _streamk_cuda_matmul_impl(a, b):
     assert a.dtype == b.dtype, "Incompatible dtypes"

     M, K = a.shape
@@ -649,3 +679,28 @@ def grid(META):
         NUM_SMS=num_sms, #
     )
     return c
+
+
+class _StreamKCudaMatmul(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, a, b):
+        # Save tensors for backward
+        ctx.save_for_backward(a, b)
+        return _streamk_cuda_matmul_impl(a, b)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        a, b = ctx.saved_tensors
+        grad_a = grad_b = None
+
+        if ctx.needs_input_grad[0]:
+            grad_a = _streamk_cuda_matmul_impl(grad_output, b.t().contiguous())
+
+        if ctx.needs_input_grad[1]:
+            grad_b = _streamk_cuda_matmul_impl(a.t().contiguous(), grad_output)
+
+        return grad_a, grad_b
+
+
+def streamk_cuda_matmul(a, b):
+    return _StreamKCudaMatmul.apply(a, b)
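
The bias rule in _StreamKAmdMatmul.backward is the standard one for a row-broadcast bias: d(bias) is the column-wise sum of grad_output, unsqueezed back to (1, N) when the bias is 2-D. A quick plain-PyTorch check of that rule against autograd (shapes here are illustrative):

import torch

m, k, n = 8, 5, 3
a = torch.randn(m, k, dtype=torch.float64, requires_grad=True)
b = torch.randn(k, n, dtype=torch.float64, requires_grad=True)
bias = torch.randn(1, n, dtype=torch.float64, requires_grad=True)   # 2-D bias, broadcast over rows

out = a @ b + bias
grad_output = torch.randn_like(out)
out.backward(grad_output)

manual_grad_bias = grad_output.sum(dim=0).unsqueeze(0)   # the rule from the diff
assert torch.allclose(bias.grad, manual_grad_bias)
assert torch.allclose(a.grad, grad_output @ b.t())
assert torch.allclose(b.grad, a.t() @ grad_output)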

tritonbench/operators/gemm/triton_matmul.py

Lines changed: 27 additions & 1 deletion

@@ -140,7 +140,7 @@ def leaky_relu(x):
 # and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel.


-def matmul(a, b, activation=""):
+def _matmul_impl(a, b, activation=""):
     # Check constraints.
     assert a.shape[1] == b.shape[0], "Incompatible dimensions"
     M, K = a.shape
@@ -176,3 +176,29 @@ def matmul(a, b, activation=""):
         ENABLE_BUFFER_OPS_ASSUMES=enable_buffer_ops_assumes,
     )
     return c
+
+
+class _TritonMatmul(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, a, b, activation=""):
+        # Save tensors for backward
+        ctx.save_for_backward(a, b)
+        ctx.activation = activation
+        return _matmul_impl(a, b, activation)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        a, b = ctx.saved_tensors
+        grad_a = grad_b = None
+
+        if ctx.needs_input_grad[0]:
+            grad_a = _matmul_impl(grad_output, b.t().contiguous(), "")
+
+        if ctx.needs_input_grad[1]:
+            grad_b = _matmul_impl(a.t().contiguous(), grad_output, "")
+
+        return grad_a, grad_b, None
+
+
+def matmul(a, b, activation=""):
+    return _TritonMatmul.apply(a, b, activation)
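
One caveat the diff leaves as-is: forward can fuse an activation such as leaky_relu, but backward calls _matmul_impl with activation="", so the gradients it produces are those of the plain matmul. If a nonlinear activation were exercised in backward mode, the chain rule would also need the activation derivative applied to grad_output, roughly as in this sketch (an illustration, not code from the repo):

import torch


def leaky_relu_matmul_backward(a, b, grad_output, negative_slope=0.01):
    # Chain rule for y = leaky_relu(a @ b): scale grad_output by the activation
    # derivative at the pre-activation values, then form the matmul gradients.
    z = a @ b
    dz = torch.where(z > 0, torch.ones_like(z), torch.full_like(z, negative_slope))
    g = grad_output * dz
    return g @ b.t(), a.t() @ g   # (grad_a, grad_b)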
