Commit 7ad7cee

[BENCH] Remove TMA workaround in swiglu (triton-lang#6711)
TMA is no longer needed in this kernel after the convert-layout cost model added in triton-lang#6699. Also fix test_swiglu.py, which broke after triton-lang#6703.

# New contributor declaration

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [x] This PR does not need a test because it already has test coverage.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
1 parent e3f9f43 commit 7ad7cee
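With the TMA branch gone, the kernel always writes its output tile with a plain masked `tl.store` (see the diffs below). For orientation only, here is a minimal, hypothetical Triton SwiGLU kernel built around that same masked-store pattern. It is not this repository's `_swiglu`: it uses the generic `silu(gate) * up` formulation, and the real kernel's `alpha`, `limit` clamping, flexpoint scaling, and expert metadata are all omitted.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def _swiglu_sketch(Out, A, N, stride_am, stride_outm, BLOCK_N: tl.constexpr):
    pid_m = tl.program_id(0)
    off_n = tl.arange(0, BLOCK_N)
    mask = off_n < N
    # each row of A is assumed to be packed as [gate | up], each half of width N, unit stride in n
    gate = tl.load(A + pid_m * stride_am + off_n, mask=mask, other=0.).to(tl.float32)
    up = tl.load(A + pid_m * stride_am + N + off_n, mask=mask, other=0.).to(tl.float32)
    out = gate * tl.sigmoid(gate) * up  # silu(gate) * up
    # plain masked store: the code path this commit keeps after dropping the TMA descriptors
    tl.store(Out + pid_m * stride_outm + off_n, out.to(Out.dtype.element_ty), mask=mask)


def swiglu_sketch(a: torch.Tensor) -> torch.Tensor:
    M, two_n = a.shape
    N = two_n // 2
    out = torch.empty((M, N), device=a.device, dtype=a.dtype)
    _swiglu_sketch[(M,)](out, a, N, a.stride(0), out.stride(0), BLOCK_N=triton.next_power_of_2(N))
    return out
```

Per the commit message, the convert-layout cost model from triton-lang#6699 is what makes a direct store like this acceptable again: the deleted `TensorDescriptor` path existed only to stop layout-conversion elimination from duplicating compute around the store.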

File tree

- bench/tests/test_swiglu.py
- bench/triton_bench/swiglu.py
- bench/triton_bench/swiglu_details/_swiglu.py

3 files changed: +6 −29 lines changed


bench/tests/test_swiglu.py

Lines changed: 1 addition & 3 deletions
@@ -5,7 +5,6 @@
 import pytest

 from .test_routing import init_data as init_routing_data
-from .test_routing import ref_expt_data

 # ---------------
 # initialize data
@@ -33,8 +32,7 @@ def test_op(M, N, limit, device, alpha=0.5):
     n_expts_act = 2
     logits = init_routing_data(M, n_expts_tot).detach()
     routing_data, _, _ = routing_torch(logits, n_expts_act)
-    expt_data = ref_expt_data(routing_data, M * n_expts_act, block_m=128)
-    n_tokens = expt_data[2 * n_expts_tot].sum()
+    n_tokens = routing_data.expt_hist.sum()

     # initialize data
     x = alloc_rand([n_tokens, N], device=device, dtype=torch.bfloat16)
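On the test change above: `routing_data.expt_hist` appears to be the per-expert histogram of routed rows, so summing it gives the same token count the deleted `ref_expt_data` indexing produced. A small, hypothetical torch sketch of that invariant; the `topk`/`bincount` construction below is illustrative and not the repo's routing code.

```python
import torch

M, n_expts_tot, n_expts_act = 1024, 8, 2
# each token picks n_expts_act distinct experts
expt_idx = torch.rand(M, n_expts_tot).topk(n_expts_act, dim=1).indices
expt_hist = torch.bincount(expt_idx.flatten(), minlength=n_expts_tot)
# every (token, expert) assignment lands in exactly one bucket
assert expt_hist.sum().item() == M * n_expts_act
```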

bench/triton_bench/swiglu.py

Lines changed: 0 additions & 14 deletions
@@ -2,7 +2,6 @@
 from triton_bench.numerics import InFlexData, OutFlexData
 import torch
 import triton
-from triton.tools.tensor_descriptor import TensorDescriptor
 from .swiglu_details._swiglu import _swiglu
 from triton_bench import target_info
 from .matmul_ogs_details.metadata import compute_metadata
@@ -35,17 +34,6 @@ def forward(ctx, a, alpha, precision_config, routing_data, num_experts):
         BLOCK_M, BLOCK_N = 32 // a.itemsize, 128
         num_warps = 4
         kwargs = {'maxnreg': 64} if not target_info.is_hip() else {}
-        # TMA descriptors
-        out_desc = None
-        a_desc = None
-        if target_info.cuda_capability_geq(9, 0) and flex_ctx.out_data.actual_scale is not None:
-            # We need TMA to store the outputs otherwise Triton will aggressively removing layout conversions at
-            # the cost of duplicating too much compute. With TMA, the layout conversion gets folded into the TMA store,
-            # and the duplication doesn't occur.
-            assert out.shape[-1] * out.element_size() % 16 == 0
-            out_desc = TensorDescriptor.from_tensor(out, (BLOCK_M, BLOCK_N))
-            assert a.shape[-1] * a.element_size() % 16 == 0
-            a_desc = TensorDescriptor.from_tensor(a, (BLOCK_M, 2 * BLOCK_N))
         # launch semi-persistent kernel
         N_BLOCKS = triton.cdiv(N // 2, BLOCK_N)
         num_sms = target_info.num_sms()
@@ -64,12 +52,10 @@ def forward(ctx, a, alpha, precision_config, routing_data, num_experts):
         if routing_data is not None:
             expt_data = compute_metadata(routing_data, M, BLOCK_M).buffer
         _swiglu[grid](
-            out_desc,
             flex_ctx.out_data.reinterpret(out),
             flex_ctx.out_data.expected_scale,
             flex_ctx.out_data.actual_scale,
             flex_ctx.out_data.checksum_scale,
-            a_desc,
             flex_ctx.inp_data.reinterpret(a),
             flex_ctx.inp_data.scale,
             alpha,
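The deleted asserts point at one constraint that disappears along with the TMA descriptors: `TensorDescriptor.from_tensor` was only reached after checking that the tensor's last dimension spans a whole number of 16-byte chunks, while a plain masked store needs no such guarantee. A throwaway sketch of that check; the shapes below are made up for illustration.

```python
import torch

for n_cols in (128, 100):
    t = torch.empty(64, n_cols, dtype=torch.bfloat16)
    row_bytes = t.shape[-1] * t.element_size()
    # 128 bf16 columns -> 256 bytes (passes the removed assert); 100 -> 200 bytes (would not)
    print(n_cols, row_bytes, row_bytes % 16 == 0)
```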

bench/triton_bench/swiglu_details/_swiglu.py

Lines changed: 5 additions & 12 deletions
@@ -35,10 +35,9 @@ def swiglu_launch_metadata(grid, kernel, args):


 @triton.jit(repr=swiglu_repr, launch_metadata=swiglu_launch_metadata)
-def _swiglu(out_desc, Out, OutExpectedScale, OutActualScale, OutChecksumScale, a_desc, A, AScale, alpha, M, N,
-            stride_am, stride_an, stride_outm, stride_outn, limit: tl.constexpr, ExptData, NUM_EXPERTS: tl.constexpr,
-            BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, EVEN_N: tl.constexpr, M_BLOCKS, N_BLOCKS,
-            flexpoint_saturate_inf: tl.constexpr):
+def _swiglu(Out, OutExpectedScale, OutActualScale, OutChecksumScale, A, AScale, alpha, M, N, stride_am, stride_an,
+            stride_outm, stride_outn, limit: tl.constexpr, ExptData, NUM_EXPERTS: tl.constexpr, BLOCK_M: tl.constexpr,
+            BLOCK_N: tl.constexpr, EVEN_N: tl.constexpr, M_BLOCKS, N_BLOCKS, flexpoint_saturate_inf: tl.constexpr):
     if ExptData is not None:
         M = tl.load(ExptData + 2 * NUM_EXPERTS)
         M_BLOCKS = (M + BLOCK_M - 1) // BLOCK_M
@@ -61,8 +60,6 @@ def _swiglu(out_desc, Out, OutExpectedScale, OutActualScale, OutChecksumScale, a
         # load a
         packed_off_n = pid_n * 2 * BLOCK_N + tl.arange(0, 2 * BLOCK_N)
         packed_offs = off_m[:, None] * stride_am + packed_off_n[None, :] * stride_an
-        if a_desc is not None:
-            a_packed = a_desc.load([pid_m * BLOCK_M, pid_n * 2 * BLOCK_N])
         if EVEN_N:
             a_packed = tl.load(A + packed_offs, mask=mask_m[:, None], other=0.)
         else:
@@ -91,11 +88,7 @@ def _swiglu(out_desc, Out, OutExpectedScale, OutActualScale, OutChecksumScale, a
         out = float_to_flex(out, out_expected_scale,
                             None,  # ActualScale: local absmax is tracked and updated after the loop
                             OutChecksumScale, None, Out, flexpoint_saturate_inf)
-
-        if out_desc is not None:
-            out_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], out.to(Out.dtype.element_ty))
-        else:
-            mask = mask_m[:, None] if EVEN_N else mask_m[:, None] and mask_n[None, :]
-            tl.store(Out + off_m[:, None] * stride_outm + off_n[None, :] * stride_outn, out, mask)
+        mask = mask_m[:, None] if EVEN_N else mask_m[:, None] and mask_n[None, :]
+        tl.store(Out + off_m[:, None] * stride_outm + off_n[None, :] * stride_outn, out, mask)

     update_scale(local_max, OutActualScale, Out)
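Two patterns in the unchanged context are worth noting: the kernel re-reads its effective row count `M` from the `ExptData` buffer at run time, and, per the "launch semi-persistent kernel" comment in swiglu.py, a bounded number of programs loop over tiles. A self-contained sketch of that combination; the kernel, the one-element metadata buffer, and every name below are hypothetical stand-ins, not the real `ExptData` layout.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def _persistent_fill(Out, MetaM, stride_m, N,
                     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, NUM_PROGRAMS: tl.constexpr):
    M = tl.load(MetaM)  # dynamic row count read from device memory, like ExptData above
    m_blocks = tl.cdiv(M, BLOCK_M)
    n_blocks = tl.cdiv(N, BLOCK_N)
    # each program strides over the tile grid, so the launch size can stay fixed
    for tile in range(tl.program_id(0), m_blocks * n_blocks, NUM_PROGRAMS):
        pid_m = tile // n_blocks
        pid_n = tile % n_blocks
        off_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
        off_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
        mask = (off_m[:, None] < M) & (off_n[None, :] < N)
        ones = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + 1.0
        # assumes Out is float32 and contiguous in its last dimension
        tl.store(Out + off_m[:, None] * stride_m + off_n[None, :], ones, mask=mask)


def fill_valid_rows(out: torch.Tensor, m: int, num_programs: int = 4) -> None:
    meta = torch.tensor([m], device=out.device, dtype=torch.int32)
    _persistent_fill[(num_programs,)](out, meta, out.stride(0), out.shape[1],
                                      BLOCK_M=32, BLOCK_N=32, NUM_PROGRAMS=num_programs)
```

Capping the grid on the host (num_sms in swiglu.py) while resolving the exact amount of work on the device is presumably what lets a data-dependent `M` be handled without reading it back to the host.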
