
Commit ad40bed

[bwd] Add default backward pass function (#520)
1 parent fdb7ac7 commit ad40bed

File tree

35 files changed: +65 / -145 lines
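
The commit title refers to a default backward-pass function added to the shared BenchmarkOperator base class; the per-operator get_bwd_fn methods removed in the files below all follow the same pattern (run the forward, seed an upstream gradient if needed, return a closure that calls backward with retain_graph=True). A minimal sketch of what such a default could look like is shown here; the class and method names come from the diff, but the body is an assumption, not the committed implementation.

from typing import Callable

import torch


class BenchmarkOperator:
    # Forward-only operators opt out of backward benchmarking entirely
    # (see the FWD_ONLY = True additions below).
    FWD_ONLY = False

    def get_bwd_fn(self, fwd_fn: Callable) -> Callable:
        # Run the forward once so the autograd graph exists.
        y = fwd_fn()
        # Some operators (e.g. flex_attention) return tuples or other
        # structures; pick the first tensor that participates in autograd.
        if not isinstance(y, torch.Tensor):
            y = next(
                t for t in y if isinstance(t, torch.Tensor) and t.requires_grad
            )
        if y.numel() == 1:
            # Scalar losses (cross_entropy, fused_linear_cross_entropy)
            # can call backward() without an explicit gradient.
            return lambda: y.backward(retain_graph=True)
        # Non-scalar outputs (embedding, flex_attention) need an upstream grad.
        dy = torch.randn_like(y)
        return lambda: y.backward(dy, retain_graph=True)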

tritonbench/operators/addmm/operator.py

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@
 class Operator(BenchmarkOperator):
     DEFAULT_METRICS = ["tflops", "best_config"]
     DEFAULT_PRECISION = "fp16"
+    FWD_ONLY = True

     def __init__(
         self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
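
addmm is the first of several operators below (the bf16 and fp8 GEMM variants and fp8_attention) that now declare FWD_ONLY = True instead of carrying backward-specific code. The attribute name comes from the diff; how the harness consumes it is not shown in this commit, but a plausible guard would look something like the following hypothetical helper, which is not part of the change:

def check_mode(op, mode: str) -> None:
    # Hypothetical harness-side guard: operators that set FWD_ONLY = True
    # are rejected for any backward-style benchmarking mode.
    if mode != "fwd" and getattr(op, "FWD_ONLY", False):
        raise NotImplementedError(
            f"{type(op).__name__} is forward-only; mode {mode!r} is unsupported."
        )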

tritonbench/operators/bf16xint16_gemm/bf16xint16_gemm.py

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@

 class Operator(BenchmarkOperator):
     DEFAULT_METRICS = ["tflops", "gbps", "latency"]
+    FWD_ONLY = True

     def __init__(
         self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None

tritonbench/operators/cross_entropy/operator.py

Lines changed: 0 additions & 5 deletions
@@ -79,11 +79,6 @@ def get_x_val(self, example_inputs) -> Tuple[int, int, int]:
         v = example_inputs[0].size(-1)
         return (self.B, self.T, v)

-    def get_bwd_fn(self, fwd_fn: Callable) -> Callable:
-        y = fwd_fn()
-        # TODO: how to pass grad_to_none=[_input]?
-        return lambda: y.backward(retain_graph=True)
-
     def get_grad_to_none(self, args) -> List[torch.Tensor]:
         x = args[0]
         return [x]

tritonbench/operators/embedding/operator.py

Lines changed: 0 additions & 5 deletions
@@ -60,8 +60,3 @@ def torch_compile_embedding(self, V, D, input, shared_weight) -> Callable:
     def get_x_val(self, example_inputs) -> Tuple[int, int, int]:
         V, D, input_tensor, _ = example_inputs
         return (input_tensor.size(0), input_tensor.size(1), D, V)
-
-    def get_bwd_fn(self, fwd_fn: Callable) -> Callable:
-        y = fwd_fn()
-        do = torch.randn_like(y)
-        return lambda: y.backward(do, retain_graph=True)

tritonbench/operators/flex_attention/operator.py

Lines changed: 0 additions & 10 deletions
@@ -419,16 +419,6 @@ def sdpa_fn():

         return sdpa_fn

-    def get_bwd_fn(self, fwd_fn: Callable) -> Callable:
-        o = fwd_fn()
-        o_tensor = input_filter(
-            lambda x: isinstance(x, torch.Tensor) and x.requires_grad,
-            o,
-        )
-        assert o_tensor is not None, "No tensor found in output that requires grad."
-        do = torch.rand_like(o_tensor)
-        return lambda: o_tensor.backward(do, retain_graph=True)
-
     def get_grad_to_none(self, args) -> List[torch.Tensor]:
         """Return tensors whose gradients should be set to None between iterations."""
         q, k, v, *_ = args
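
Of the removed methods, this flex_attention version was the most general: it filtered the forward output for a tensor requiring grad before seeding a random gradient. A shared default presumably needs an equivalent step for operators whose forward returns a tuple or other structure, which is why the sketch near the top of this page includes one.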

tritonbench/operators/fp8_attention/operator.py

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ def parse_op_args(args: List[str]):
 class Operator(BenchmarkOperator):
     DEFAULT_METRICS = ["latency", "tflops"]
     DEFAULT_PRECISION = "fp8"
+    FWD_ONLY = True

     def __init__(
         self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None

tritonbench/operators/fp8_gemm/fp8_gemm.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ def parse_args(args):
 class Operator(BenchmarkOperator):
     DEFAULT_METRICS = ["tflops", "gbps", "latency"]
     DEFAULT_PRECISION = "fp8"
+    FWD_ONLY = True

     def __init__(
         self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None

tritonbench/operators/fp8_gemm_blockwise/operator.py

Lines changed: 1 addition & 0 deletions
@@ -123,6 +123,7 @@ def fp8_block_quantize(
 class Operator(BenchmarkOperator):
     DEFAULT_METRICS = ["tflops", "speedup", "accuracy"]
     DEFAULT_PRECISION = "fp8"
+    FWD_ONLY = True

     def __init__(
         self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None

tritonbench/operators/fp8_gemm_rowwise_grouped/operator.py

Lines changed: 1 addition & 0 deletions
@@ -333,6 +333,7 @@ class Operator(BenchmarkOperator):

     DEFAULT_METRICS = ["tflops", "gbps", "speedup", "accuracy"]
     DEFAULT_PRECISION = "fp8"
+    FWD_ONLY = True

     def __init__(
         self,

tritonbench/operators/fused_linear_cross_entropy/operator.py

Lines changed: 0 additions & 4 deletions
@@ -108,7 +108,3 @@ def torch_compile_fused_linear_cross_entropy(
     @register_x_val(label="(B*T, H)")
     def get_x_val(self, example_inputs) -> Tuple[int, int]:
         return (example_inputs[0].size(0), example_inputs[0].size(1))
-
-    def get_bwd_fn(self, fwd_fn: Callable) -> Callable:
-        y = fwd_fn()
-        return lambda: y.backward(retain_graph=True)
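
With the default in place, operators such as cross_entropy, embedding, and fused_linear_cross_entropy need no per-operator code to be benchmarked in backward mode. Assuming tritonbench's usual entry point and flags (not shown in this diff), a backward run would look like:

python run.py --op embedding --mode bwd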

0 commit comments
