gather_gemv: fix eager impl to support large input; fix compile impl (#455)

yf225 · web-flow · commit 2c15edbf1a18 · 2025-09-18T20:30:03.000-07:00
diff --git a/tritonbench/operators/gather_gemv/operator.py b/tritonbench/operators/gather_gemv/operator.py
@@ -44,11 +44,25 @@ def test_0(self, p1, p2, p3) -> Callable:
 
     @register_benchmark(baseline=True)
     def test_eager(self, w, idx, x):
-        return lambda: w[idx].to(x.dtype) @ x
+        s = x.size(0)
+
+        if s <= 8192:
+            return lambda: w[idx].to(x.dtype) @ x
+
+        # For very large matrices (e.g. S=16384) the batched advanced indexing
+        # path above launches a CUDA kernel with an invalid configuration.
+        # Fall back to per-expert slicing which is slower but robust.
+        def eager_impl():
+            outputs = []
+            for idx_val in idx.tolist():
+                outputs.append(w[idx_val].to(x.dtype) @ x)
+            return torch.stack(outputs, dim=0)
+
+        return eager_impl
 
     @register_benchmark()
     def test_inductor(self, w, idx, x):
-        @torch.compile
+        @torch.compile(mode="max-autotune-no-cudagraphs")
         def gather_gemv(w, idx, x):
             return w[idx].to(x.dtype) @ x