Commit 87c4375

Add missing torch.compile impl / improve compile config (#380)

1 parent 7f3b62f commit 87c4375

9 files changed: +178 additions, -22 deletions
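Every torch.compile call site in this commit passes mode="max-autotune-no-cudagraphs", and operators that lacked a compiled variant gain one. For context (a minimal sketch, not part of the diff): "max-autotune" searches a wider space of generated Triton kernel configurations than the default mode, and the "-no-cudagraphs" suffix leaves CUDA graph capture off, so timings reflect the kernels themselves rather than graph-replay effects.

import torch

def f(x: torch.Tensor) -> torch.Tensor:
    return torch.softmax(x, dim=-1)

# Default config: moderate compile time, no aggressive kernel search.
compiled_default = torch.compile(f)

# The config used throughout this commit: aggressive Triton autotuning,
# CUDA graphs disabled.
compiled_tuned = torch.compile(f, mode="max-autotune-no-cudagraphs")

x = torch.randn(32, 128)
assert torch.allclose(compiled_default(x), compiled_tuned(x))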

tritonbench/operators/cross_entropy/operator.py
Lines changed: 3 additions & 1 deletion

@@ -69,7 +69,9 @@ def liger_cross_entropy_loss(self, input, target) -> Callable:

     @register_benchmark()
     def inductor_cross_entropy_loss(self, input, target) -> Callable:
-        compiled = torch.compile(self.baseline_model, dynamic=False)
+        compiled = torch.compile(
+            self.baseline_model, dynamic=False, mode="max-autotune-no-cudagraphs"
+        )
         return lambda: compiled(input, target)

     @register_x_val(label="(B, T, V)")
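Note that inductor_cross_entropy_loss keeps dynamic=False, which asks the compiler to specialize on the exact input shapes it sees instead of emitting shape-polymorphic code. A minimal sketch of the same configuration outside the harness (hypothetical model, not the benchmark's baseline_model):

import torch
import torch.nn as nn

model = nn.Linear(16, 8)

# dynamic=False: the compiled graph is specialized to the first shapes seen;
# a new input shape triggers a recompile rather than a dynamic-shape kernel.
compiled = torch.compile(model, dynamic=False, mode="max-autotune-no-cudagraphs")
out = compiled(torch.randn(4, 16))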

tritonbench/operators/embedding/operator.py
Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ def liger_embedding(self, V, D, input, shared_weight) -> Callable:
     def inductor_embedding(self, V, D, input, shared_weight) -> Callable:
         self.baseline_op = Embedding(V, D).to(self.device).to(self.dtype)
         self.baseline_op.weight.data.copy_(shared_weight)
-        compiled = torch.compile(self.baseline_op)
+        compiled = torch.compile(self.baseline_op, mode="max-autotune-no-cudagraphs")
         return lambda: compiled(input)

     @register_x_val(label="(B, T, D, V)")
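Here torch.compile wraps an nn.Module rather than a function. As a sketch of the behavior being relied on (my reading, not stated in the diff): the compiled wrapper shares the original module's parameters, so the weight copy above stays visible to the compiled forward.

import torch
from torch.nn import Embedding

emb = Embedding(1000, 64)
emb.weight.data.copy_(torch.randn(1000, 64))

# Compiling a module returns a wrapper over the same parameters;
# in-place weight updates remain visible to the compiled forward.
compiled = torch.compile(emb, mode="max-autotune-no-cudagraphs")
out = compiled(torch.randint(0, 1000, (8,)))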

tritonbench/operators/jagged_mean/operator.py
Lines changed: 29 additions & 2 deletions

@@ -111,14 +111,23 @@ def __init__(

         self.tensor_bytes_limit = get_tensor_bytes_limit(tb_args.test_only)

-    @register_benchmark(baseline=True)
+    @register_benchmark()
     def torch_jagged_mean_unbind_torch_mean(
         self, x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float
     ):
         return lambda: torch.cat(
             [torch.mean(t, dim=0).unsqueeze(0) for t in x.unbind()]
         )  # in 3D tensor (B, *, M), takes the mean of B 2D tensors (*, M)

+    @register_benchmark()
+    def torch_compile_jagged_mean_unbind_torch_mean(
+        self, x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float
+    ):
+        return torch.compile(
+            self.torch_jagged_mean_unbind_torch_mean(x, B, M, seqlen, sparsity),
+            mode="max-autotune-no-cudagraphs",
+        )
+
     @register_benchmark()
     def torch_jagged_mean_torch_nanmean(
         self, x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float

@@ -138,6 +147,15 @@ def torch_jagged_mean_torch_nanmean(
         )

     @register_benchmark()
+    def torch_compile_jagged_mean_torch_nanmean(
+        self, x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float
+    ):
+        return torch.compile(
+            self.torch_jagged_mean_torch_nanmean(x, B, M, seqlen, sparsity),
+            mode="max-autotune-no-cudagraphs",
+        )
+
+    @register_benchmark(baseline=True)
     def torch_jagged_mean_torch_sum(
         self, x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float
     ):

@@ -155,6 +173,15 @@ def torch_jagged_mean_torch_sum(
             / x.offsets().diff().unsqueeze(1)
         )

+    @register_benchmark()
+    def torch_compile_jagged_mean_torch_sum(
+        self, x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float
+    ):
+        return torch.compile(
+            self.torch_jagged_mean_torch_sum(x, B, M, seqlen, sparsity),
+            mode="max-autotune-no-cudagraphs",
+        )
+
     @register_benchmark()
     def triton_jagged_mean_simple_fused(
         self, x: torch.Tensor, B: int, M: int, seqlen: int, sparsity: float

@@ -182,7 +209,7 @@ def _inner(x: torch.Tensor):  # mean along ragged dimension (dim == 1)
                 x, dim=x._ragged_idx, keepdim=True
             )  # pyre-ignore: Undefined attribute [16]: `torch._tensor.Tensor` has no attribute `_ragged_idx`.

-        torch_compile_func = torch.compile(_inner)
+        torch_compile_func = torch.compile(_inner, mode="max-autotune-no-cudagraphs")
         return lambda: torch_compile_func(x)

     def get_x_val(self, example_inputs):
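Two things happen in this file: the baseline moves from the unbind variant to torch_jagged_mean_torch_sum, and each eager variant gains a compiled twin. The twins work by compiling the zero-arg closure that the eager benchmark returns. A minimal sketch of that pattern, with hypothetical names:

import torch

def eager_benchmark(x: torch.Tensor):
    # Eager benchmarks return a zero-arg closure over their inputs.
    return lambda: torch.sum(x, dim=0)

x = torch.randn(128, 64)

# Compiling the closure yields another zero-arg callable for the harness
# to time; the first invocation compiles, later ones reuse the kernel.
compiled = torch.compile(eager_benchmark(x), mode="max-autotune-no-cudagraphs")
result = compiled()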

tritonbench/operators/layer_norm/operator.py
Lines changed: 42 additions & 8 deletions

@@ -1,4 +1,5 @@
-from typing import Callable, List
+import argparse
+from typing import Callable, List, Optional

 import torch
 import torch.nn.functional as F

@@ -10,10 +11,28 @@
     Mode,
     register_benchmark,
     register_metric,
+    register_x_val,
 )

 from . import tutorial

+
+def parse_op_args(args: List[str]):
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--M",
+        type=int,
+        default=4096,
+        help="[Optional] Size of dimension 0 in input shape (integer), default: 4096",
+    )
+    parser.add_argument(
+        "--N",
+        type=int,
+        help="[Optional] Size of dimension 1 in input shape (integer)",
+    )
+    return parser.parse_args(args)
+
+
 try:
     from liger_kernel.ops.layer_norm import LigerLayerNormFunction

@@ -24,6 +43,14 @@


 class Operator(BenchmarkOperator):
+    def __init__(
+        self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
+    ):
+        super().__init__(tb_args, extra_args)
+        args = parse_op_args(self.extra_args)
+        self.M = args.M
+        self.N = args.N
+
     @register_benchmark()
     def triton_layer_norm(self, *args):
         return lambda: tutorial.layer_norm(*args)

@@ -43,7 +70,7 @@ def torch_compile_layer_norm(self, *args):
         functorch_config.donated_buffer = False
         import torch

-        @torch.compile
+        @torch.compile(mode="max-autotune-no-cudagraphs")
         def inner(*args):
             return F.layer_norm(*args)

@@ -64,10 +91,16 @@ def get_grad_to_none(self, args) -> List[torch.Tensor]:
         return [x]

     def get_input_iter(self):
-        M = 4096
         eps = 1e-5
-        for N in [512 * i for i in range(2, 32)]:
-            x_shape = (M, N)
+
+        # If N is provided, use only that value; otherwise use the default range
+        if self.N is not None:
+            N_values = [self.N]
+        else:
+            N_values = [512 * i for i in range(2, 32)]
+
+        for N in N_values:
+            x_shape = (self.M, N)
             w_shape = (x_shape[-1],)
             x = -2.3 + 0.5 * torch.randn(
                 x_shape,

@@ -83,9 +116,10 @@ def get_input_iter(self):
         )
         yield (x, w_shape, weight, bias, eps)

+    @register_x_val(label="(M, N)")
     def get_x_val(self, args):
-        _, N = args[0].shape
-        return N
+        M, N = args[0].shape
+        return (M, N)

     @register_metric()
     def gbps(self, fn, args, metrics: BenchmarkOperatorMetrics) -> float:

@@ -114,7 +148,7 @@ def plot(self):
             styles=[("blue", "-"), ("green", "-")],
             ylabel="GB/s",
             plot_name="layer-norm-fwd",
-            args={"M": 4096},
+            args={"M": self.M},
         )
     )
     def _plot(M, N, provider):
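The new parse_op_args hook replaces the hard-coded M = 4096 sweep with shapes configurable via the operator's extra CLI arguments; rms_norm and softmax below get the same treatment with their own flags and defaults. A quick illustration of the parsing behavior, assuming parse_op_args as defined above:

# Both flags given: get_input_iter yields a single (M, N) shape.
args = parse_op_args(["--M", "8192", "--N", "2048"])
assert args.M == 8192 and args.N == 2048

# --N omitted: args.N is None, so get_input_iter falls back to the
# default sweep N in [512 * i for i in range(2, 32)] at the default M.
args = parse_op_args([])
assert args.M == 4096 and args.N is None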

tritonbench/operators/rms_norm/operator.py
Lines changed: 29 additions & 4 deletions

@@ -29,6 +29,22 @@
     QuackRMSNorm = None


+def parse_op_args(args: List[str]):
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--M",
+        type=int,
+        default=2048,
+        help="[Optional] Size of dimension 0 in input shape (integer), default: 2048",
+    )
+    parser.add_argument(
+        "--H",
+        type=int,
+        help="[Optional] Hidden size dimension (integer)",
+    )
+    return parser.parse_args(args)
+
+
 # Reference: https://github.com/linkedin/Liger-Kernel/
 # blob/main/benchmark/scripts/benchmark_rms_norm.py

@@ -55,14 +71,22 @@ def __init__(
         self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
     ):
         super().__init__(tb_args, extra_args)
-        self.M = 2048
+        args = parse_op_args(self.extra_args)
+        self.M = args.M
+        self.H = args.H
         self.eps = 1e-6
         # they are generated later
         self.llama_rms_op = None
         self.liger_rms_op = None

     def get_input_iter(self) -> Generator:
-        for H in [2**i for i in range(10, 16)]:
+        # If H is provided, use only that value; otherwise use the default range
+        if self.H is not None:
+            H_values = [self.H]
+        else:
+            H_values = [2**i for i in range(10, 16)]
+
+        for H in H_values:
             x_shape = (self.M, H)
             _input = torch.randn(x_shape, dtype=self.dtype, device=self.device)
             yield H, _input

@@ -88,7 +112,7 @@ def inductor_rms(self, H, input) -> Callable:
         self.llama_rms_op = LlamaRMSNorm(hidden_size=H, eps=self.eps).to(
             self.device
         )
-        compiled = torch.compile(self.llama_rms_op)
+        compiled = torch.compile(self.llama_rms_op, mode="max-autotune-no-cudagraphs")
         return lambda: compiled(input)

     @register_benchmark(enabled=is_hip() and HAS_AITER)

@@ -98,7 +122,8 @@ def aiter(self, H, input) -> Callable:

     @register_x_val(label="(M, H)")
     def get_x_val(self, example_inputs) -> Tuple[int, int]:
-        return (self.M, example_inputs[0])
+        H = example_inputs[0]
+        return (self.M, H)

     def get_bwd_fn(self, fwd_fn: Callable) -> Callable:
         y = fwd_fn()
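For rms_norm the default hidden-size sweep is powers of two, and passing --H collapses it to a single point. Spelled out (a small illustration of the code above, nothing more):

# Default sweep when --H is not given:
H_values = [2**i for i in range(10, 16)]
print(H_values)  # [1024, 2048, 4096, 8192, 16384, 32768]

# With --H 4096 the operator benchmarks a single shape, (self.M, 4096).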

tritonbench/operators/softmax/operator.py
Lines changed: 46 additions & 6 deletions

@@ -1,4 +1,5 @@
-from typing import Generator, List
+import argparse
+from typing import Generator, List, Optional

 import torch
 import triton

@@ -13,6 +14,7 @@
     BenchmarkOperatorMetrics,
     register_benchmark,
     register_metric,
+    register_x_val,
 )

 try:

@@ -23,9 +25,33 @@
     HAS_QUACK = False


+def parse_op_args(args: List[str]):
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--M",
+        type=int,
+        default=4096,
+        help="[Optional] Size of dimension 0 in input shape (integer), default: 4096",
+    )
+    parser.add_argument(
+        "--N",
+        type=int,
+        help="[Optional] Size of dimension 1 in input shape (integer)",
+    )
+    return parser.parse_args(args)
+
+
 class Operator(BenchmarkOperator):
     is_compute_bound = False

+    def __init__(
+        self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
+    ):
+        super().__init__(tb_args, extra_args)
+        args = parse_op_args(self.extra_args)
+        self.M = args.M
+        self.N = args.N
+
     @register_benchmark()
     def triton_softmax(self, x):
         n_rows, n_cols = x.shape

@@ -117,21 +143,35 @@ def quack(self, x):
         inner = lambda: quack_softmax(x)
         return inner

+    @register_benchmark()
+    def torch_compile_softmax(self, x):
+        @torch.compile(mode="max-autotune-no-cudagraphs")
+        def _inner(x):
+            return torch.nn.functional.softmax(x, dim=1)
+
+        return lambda: _inner(x)
+
     def get_input_iter(self):
-        M = 4096
-        shapes = [(M, 128 * i) for i in range(2, 100)]
+        # If N is provided, use only that value; otherwise use the default range
+        if self.N is not None:
+            shapes = [(self.M, self.N)]
+        else:
+            shapes = [(self.M, 128 * i) for i in range(2, 100)]
+
         if is_fbcode() and self.tb_args.production_shapes:
             additional_shapes = get_production_shapes(
                 self.name, "softmax", self.tb_args.shuffle_shapes
            )
            if additional_shapes:
                shapes.extend(additional_shapes)
+
         for M, N in shapes:
             yield (torch.randn([M, N], dtype=self.dtype, device=self.device),)

+    @register_x_val(label="(M, N)")
     def get_x_val(self, example_inputs):
-        shape = example_inputs[0].size()
-        return [shape[0], shape[1]]
+        M, N = example_inputs[0].shape
+        return (M, N)

     @register_metric()
     def gbps(self, fn, example_inputs, metrics: BenchmarkOperatorMetrics) -> float:

@@ -161,7 +201,7 @@ def plot(self):
             ylabel="GB/s",  # label name for the y-axis
             plot_name="softmax-performance",  # name for the plot. Used also as a file name for saving the plot.
             args={
-                "M": 4096
+                "M": self.M
             },  # values for function arguments not in `x_names` and `y_name`
         )
     )
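torch_compile_softmax applies torch.compile in decorator form; the decorator and call forms are equivalent. A minimal standalone sketch:

import torch
import torch.nn.functional as F

# Decorator form, as in torch_compile_softmax above:
@torch.compile(mode="max-autotune-no-cudagraphs")
def softmax_dec(x: torch.Tensor) -> torch.Tensor:
    return F.softmax(x, dim=1)

# Equivalent call form:
softmax_call = torch.compile(
    lambda x: F.softmax(x, dim=1), mode="max-autotune-no-cudagraphs"
)

x = torch.randn(64, 256)
assert torch.allclose(softmax_dec(x), softmax_call(x))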

tritonbench/operators/sum/operator.py
Lines changed: 8 additions & 0 deletions

@@ -197,6 +197,14 @@ def _inner():
     def torch_sum(self, x: torch.Tensor):
         return lambda: torch.sum(x, dim=self.reduce_dim)

+    @register_benchmark()
+    def torch_compile_sum(self, x: torch.Tensor):
+        @torch.compile(mode="max-autotune-no-cudagraphs")
+        def _inner(x):
+            return torch.sum(x, dim=self.reduce_dim)
+
+        return lambda: _inner(x)
+
     def get_x_val(self, example_inputs):
         if self.M is None:
             return example_inputs[0].shape[0]

tritonbench/operators/vector_add/operator.py
Lines changed: 8 additions & 0 deletions

@@ -57,6 +57,14 @@ def _inner():
     def torch_add(self, x: torch.Tensor, y: torch.Tensor):
         return lambda: x + y

+    @register_benchmark()
+    def torch_compile_add(self, x: torch.Tensor, y: torch.Tensor):
+        @torch.compile(mode="max-autotune-no-cudagraphs")
+        def _inner(x, y):
+            return x + y
+
+        return lambda: _inner(x, y)
+
     def get_x_vals(self) -> List[int]:
         return [2**i for i in range(12, 28, 1)]
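The sum and vector_add additions share one shape: define the compiled function inside the benchmark method, then return a zero-arg lambda for the timing loop. A minimal sketch of why the extra lambda exists (assuming, as elsewhere in tritonbench, that the harness calls the returned value with no arguments):

import torch

@torch.compile(mode="max-autotune-no-cudagraphs")
def _inner(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return x + y

x = torch.randn(1 << 20)
y = torch.randn_like(x)

# The zero-arg closure lets the harness time the call repeatedly
# without re-supplying inputs.
bench_fn = lambda: _inner(x, y)
out = bench_fn()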
