Commit 7435a0d (parent: e6b4ddc)

clean up

3 files changed (+2, -42 lines)
tritonbench/operators/gdpa/gdpa.py (0 additions, 1 deletion)

@@ -1952,7 +1952,6 @@ def alloc_fn(size: int, alignment: int, stream: int | None):
     ad_to_request_offset = create_dummy_tensor(query)

     activation_enum_int = activation_string_to_int(activation)
-    # print("activation_enum_int", activation, activation_enum_int)
     kernel_info = capture_triton(kernel_fn)[grid](
         q,
         query_offset,

tritonbench/operators/gdpa/gdpa_utils.py (1 addition, 0 deletions)

@@ -5,6 +5,7 @@
 from functools import lru_cache
 from typing import Any, List, Optional

+# need this for OSS
 import fbgemm_gpu

 import torch
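A note on the hunk above: the fbgemm_gpu import looks unused, and the new "# need this for OSS" comment suggests (this is a reading of the comment, not something the commit states) that the import is kept for its side effect of registering the fbgemm operators with PyTorch, so that torch.ops.fbgemm.* lookups resolve in open-source builds. A minimal sketch of that side effect, using asynchronous_complete_cumsum, one of the ops fbgemm_gpu registers:

    # Sketch only, not part of this commit; assumes fbgemm_gpu is installed.
    import fbgemm_gpu  # noqa: F401  # side effect: registers torch.ops.fbgemm.*
    import torch

    lengths = torch.tensor([3, 1, 2], dtype=torch.int64)
    # Without the import above, this lookup fails in an OSS build because
    # the fbgemm op schemas were never registered with torch.
    offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
    print(offsets)  # tensor([0, 3, 4, 6])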

tritonbench/operators/gdpa/operator.py (1 addition, 41 deletions)

@@ -84,28 +84,6 @@ def get_attn_config(config_name, dtype=torch.bfloat16):
     return default_config


-def get_cutlass_config(dtype=torch.bfloat16):
-    default_config = {
-        "B": 1152,
-        "max_M": 1000,
-        "D": 512,
-        "H": 4,
-        "dense_q_len": 192,
-        "sparsity": 1.0,
-        "dense_q": False,
-        "dff": None,
-        "bias": False,
-        "dtype": dtype,
-        "fused_kv": False,
-        "window_size": None,
-        "broadcast_q": False,
-        "activation": "fast_gelu",
-    }
-    # per event pffn, pma, self_attn share the same setting
-
-    return default_config
-
-
 all_configs = [
     "_".join([event_size, attn_type])
     for event_size in ["long_event", "short_event"]
@@ -323,8 +301,7 @@ def _inner():

     def get_input_iter(self) -> Generator:
         for config_name in self.config_names:
-            config = get_cutlass_config(self.dtype)
-            # config = get_attn_config(config_name, self.dtype)
+            config = get_attn_config(config_name, self.dtype)
             B = self.batch
             max_M = self.max_seq_len
             D = self.dim
@@ -433,23 +410,6 @@ def gbps(
         memory_bandwidth_gb_per_sec = memory_size_gb / (ms * 1e-3)
         return memory_bandwidth_gb_per_sec

-    @register_metric()
-    def flops(
-        self, fn_name: str, example_inputs: Any, metrics: BenchmarkOperatorMetrics
-    ) -> float:
-        B = self.batch
-        max_M = self.max_seq_len
-        D = self.dim
-        H = self.head
-        config = get_cutlass_config(self.dtype)
-        sparsity = config["sparsity"]
-
-        print("D/dim", D)  # D/self.dim, assume H * dim in script is D
-        total_flops = 4 * B * max_M * sparsity * D * D  # H * self.dim
-        # ms = metrics.latency
-        # print(f"TFLOP/s: {total_flops / 1e9 / ms :.2f}")
-        return total_flops
-
     @register_metric()
     def activation_mb(
         self, fn: Callable, example_inputs: Any, metrics: BenchmarkOperatorMetrics
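For context on the deleted flops metric: plugging in the defaults from the also-deleted get_cutlass_config gives a feel for the magnitude it reported. This is just arithmetic over the removed code, not part of the commit:

    # Deleted estimate: total_flops = 4 * B * max_M * sparsity * D * D
    B, max_M, D, sparsity = 1152, 1000, 512, 1.0  # removed get_cutlass_config defaults
    total_flops = 4 * B * max_M * sparsity * D * D
    print(f"{total_flops / 1e12:.2f} TFLOPs")  # ~1.21 TFLOPs per invocation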
