@@ -39,9 +39,8 @@ def mask_indx(idx, n_expts_act):
     return idx


-def init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter):
-    dev = "cuda"
-    logits = torch.randn((m, n_expts_tot), dtype=torch.float16, device=dev, requires_grad=True)
+def init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter, device="cuda"):
+    logits = torch.randn((m, n_expts_tot), dtype=torch.float16, device=device, requires_grad=True)
     routing_data, gather_idx, scatter_idx = routing(logits, n_expts_act, simulated_ep=n_expt_shards)
     routing_data.gate_scal = None
     gather_idx = gather_idx if do_gather else None
@@ -50,17 +49,18 @@ def init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_


 def init_compute_data(m, n, k, gindx, sindx, n_expts_tot, n_expts_act, n_expt_shards, mode, act_dtype, weight_dtype,
-                      has_y_gammas, requires_grad=True, dev="cuda"):
+                      has_y_gammas, requires_grad=True, device="cuda"):
     torch.manual_seed(0)
     assert mode in {'batched', 'ragged'}
     in_m = m * (n_expts_act if gindx is None else 1)
     out_m = m * (n_expts_act if sindx is None else 1)
     shape_x = (n_expts_tot, in_m, k) if mode == 'batched' else (in_m, k)
-    x = alloc_rand(shape_x, device=dev, dtype=act_dtype, requires_grad=requires_grad)
-    w = alloc_rand((n_expts_tot // n_expt_shards, k, n), device=dev, dtype=weight_dtype, requires_grad=requires_grad)
-    bias = alloc_rand((n_expts_tot // n_expt_shards, n), device=dev, dtype=torch.float32, requires_grad=requires_grad)
-    gs0 = 2**torch.randint(-5, 0, (m * n_expts_act, ), device=dev, dtype=torch.float32, requires_grad=requires_grad)
-    gs1 = 2**torch.randint(-5, 0, (m * n_expts_act, ), device=dev, dtype=torch.float32, requires_grad=requires_grad)
+    x = alloc_rand(shape_x, device=device, dtype=act_dtype, requires_grad=requires_grad)
+    w = alloc_rand((n_expts_tot // n_expt_shards, k, n), device=device, dtype=weight_dtype, requires_grad=requires_grad)
+    bias = alloc_rand((n_expts_tot // n_expt_shards, n), device=device, dtype=torch.float32,
+                      requires_grad=requires_grad)
+    gs0 = 2**torch.randint(-5, 0, (m * n_expts_act, ), device=device, dtype=torch.float32, requires_grad=requires_grad)
+    gs1 = 2**torch.randint(-5, 0, (m * n_expts_act, ), device=device, dtype=torch.float32, requires_grad=requires_grad)
     gs0 = gs0.detach().requires_grad_(requires_grad)
     gs1 = gs1.detach().requires_grad_(requires_grad)
     if mode == 'batched' or (not has_y_gammas) or (has_y_gammas and (gindx is not None) and act_dtype.itemsize >= 2):
@@ -75,12 +75,13 @@ def init_compute_data(m, n, k, gindx, sindx, n_expts_tot, n_expts_act, n_expt_sh
 # ---------------


-def init_precision(out_dtype, act_use_flexpoint, weight_use_flexpoint, n_expts_tot=1, mx_ctx=MicroscalingCtx()):
+def init_precision(out_dtype, act_use_flexpoint, weight_use_flexpoint, n_expts_tot=1, mx_ctx=MicroscalingCtx(),
+                   device="cuda"):
     # flexpoint
     make_tensor = lambda val0, val1: torch.tensor([val0, val1] * (n_expts_tot // 2) +
                                                   ([val0]
-                                                   if n_expts_tot % 2 else []), dtype=torch.float32, device="cuda")
-    make_scalar = lambda val: torch.tensor([val], dtype=torch.float32, device="cuda")
+                                                   if n_expts_tot % 2 else []), dtype=torch.float32, device=device)
+    make_scalar = lambda val: torch.tensor([val], dtype=torch.float32, device=device)
     in_flex_data = lambda scale, use_flex: InFlexData(dtype=torch.float8_e5m2, scale=make_scalar(scale)
                                                       ) if use_flex else InFlexData()
     in_flex_edata = lambda scale0, scale1, use_flex: InFlexData(dtype=torch.float8_e5m2, scale=make_tensor(
@@ -211,7 +212,7 @@ class Case:
 @pytest.mark.parametrize("has_y_gammas", [False, True])
 @pytest.mark.parametrize("is_persistent", [False, True])
 def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas, is_persistent, n_expts_tot,
-            n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, swizzle_mx_scale):
+            n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, swizzle_mx_scale, device):
     # TODO: remove when Triton FP8 supports proper RTNE
     if "float8" in weight_dtype_str and torch.cuda.get_device_capability()[0] < 9:
         pytest.skip("Float8 not tested on A100")
@@ -254,16 +255,17 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
     act_is_float8 = act_dtype.itemsize == 1
     weight_is_float8 = weight_dtype.itemsize == 1
     precision_opt = init_precision(act_dtype, act_is_float8, weight_is_float8 and not is_mixed_input,
-                                   n_expts_tot // n_expt_shards)
+                                   n_expts_tot // n_expt_shards, device=device)
     # precision_opt.x_pad_trans_requires_flexpoint = False
     if mode == "ragged":
-        m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter)
+        m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter,
+                                                   device=device)
     else:
         rdata = gindx = sindx = None
     x_tri, w_tri, bias_tri, gs0_tri, gs1_tri = init_compute_data(m, n, k, gindx, sindx, n_expts_tot, n_expts_act,
                                                                  n_expt_shards, mode, act_dtype,  #
                                                                  torch.bfloat16 if is_mixed_input else weight_dtype,
-                                                                 has_y_gammas, requires_grad=test_bwd)
+                                                                 has_y_gammas, requires_grad=test_bwd, device=device)
     x_ref, w_ref, bias_ref, gs0_ref, gs1_ref = apply_precision(x_tri, w_tri, bias_tri, gs0_tri, gs1_tri, precision_opt)

     if is_mixed_input:
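
The change threads a `device` argument through `init_precision`, `init_routing_data`, `init_compute_data`, and `test_op`; pytest would normally inject that argument into `test_op` via a fixture, which this diff does not show. Below is a minimal sketch of one way such a fixture could be defined in a `conftest.py`, assuming a hypothetical `--device` command-line option; the actual project may provide the fixture differently.

# Hypothetical conftest.py sketch (not part of this diff): one possible source
# of the `device` argument that test_op and the init_* helpers now accept.
import pytest
import torch


def pytest_addoption(parser):
    # Assumed CLI flag; defaults to the same "cuda" device the helpers default to.
    parser.addoption("--device", action="store", default="cuda",
                     help="torch device to run tests on, e.g. 'cuda', 'cuda:1'")


@pytest.fixture
def device(request):
    dev = request.config.getoption("--device")
    # Skip rather than fail when a CUDA device is requested but unavailable.
    if dev.startswith("cuda") and not torch.cuda.is_available():
        pytest.skip("CUDA device requested but not available")
    return dev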