
Commit e120c12

Merge commit '981e987eed9053b952f81153bc0779c99d8c642e'
2 parents 54214d5 + 981e987

File tree

7 files changed: +201 additions, -213 deletions

bench/tests/test_routing.py

Lines changed: 5 additions & 0 deletions
@@ -83,6 +83,11 @@ def bench_routing():
     tri_routing_data, tri_gather, tri_scatter = routing(tri_logits, n_expts_act)
     tri_metadata = compute_metadata(tri_routing_data, n_tokens * n_expts_act, block_m)
     proton.finalize()
+    try:
+        import os
+        os.system("proton-viewer -m time/ms routing.hatchet")
+    except:
+        pass


 if __name__ == "__main__":
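The added block shells out to proton-viewer and silences any failure with a bare except. As a rough equivalent (not part of the commit, and assuming proton-viewer is on PATH next to the freshly written routing.hatchet), the same guard can be written explicitly:

import shutil
import subprocess

# Sketch only: print the time/ms metric for the routing profile if the viewer exists.
if shutil.which("proton-viewer") is not None:
    subprocess.run(["proton-viewer", "-m", "time/ms", "routing.hatchet"], check=False)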

bench/triton_bench/routing.py

Lines changed: 35 additions & 60 deletions
@@ -13,10 +13,9 @@ def _routing_compute_expt_offs(ExpertHist, FinalExpertOffs, hist_size, # histog
         offs_n = i * BLOCK_N + tl.arange(0, BLOCK_N)
         mask_n = offs_n < hist_size
         hist2 = tl.load(ExpertHist + offs_n, mask=mask_n)
-        tok_starts = tl.cumsum(hist2, 0) + x
+        tok_starts = tl.cumsum(hist2, 0) - hist2 + x
         x += tl.sum(hist2, 0)
-        tl.store(FinalExpertOffs, 0)
-        tl.store(FinalExpertOffs + 1 + offs_n, tok_starts, mask=mask_n)
+        tl.store(FinalExpertOffs + offs_n, tok_starts, mask=mask_n)
         offs_n += BLOCK_N

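In the hunk above, subtracting hist2 from the inclusive tl.cumsum yields an exclusive prefix sum, so each expert's starting offset lands directly at FinalExpertOffs + expert and the separate store of the leading zero (plus the +1 shift) disappears. A minimal NumPy stand-in for that identity (illustrative only, not from the repository):

import numpy as np

hist = np.array([3, 0, 2, 5])              # tokens routed to each expert
inclusive = np.cumsum(hist)                # [3, 3, 5, 10]
exclusive = inclusive - hist               # [0, 3, 3, 5]: first-token offset per expert
assert (exclusive == np.concatenate(([0], inclusive[:-1]))).all()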

@@ -52,51 +51,33 @@ def _keyed_add(x, y):


 @triton.jit
-def _count_previous(x):
-    """
-    Input x : uint16[..., N]
-    Output y : uint32[..., N]
-    semantics : y[..., i] = sum_j((x[..., j] == x[..., i]) & (j < i))
-    credits: @apgoucher
-    """
+def _routing_compute_indx(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm, n_gates,
+                          BLOCK_M: tl.constexpr, N_EXPTS_ACT: tl.constexpr):

-    BLOCK_N: tl.constexpr = x.shape[-1]  # summation axis
-    BATCHES: tl.constexpr = x.numel // BLOCK_N  # number of batches
+    pid_m = tl.program_id(0)

-    # reduce to two-dimensional case:
-    y = tl.reshape(x, [BATCHES, BLOCK_N]).to(tl.uint32)
+    tl.static_assert(N_EXPTS_ACT * BLOCK_M <= 32768)

-    tl.static_assert(BLOCK_N <= 32768, "compute_run_lengths requires axis to have length <= 32768")
+    local_offs = tl.arange(0, N_EXPTS_ACT * BLOCK_M)
+    offs = pid_m * BLOCK_M * N_EXPTS_ACT + local_offs
+    expert = tl.load(ExptIndx + offs, mask=(offs < n_gates), other=-1).to(tl.uint32)

-    # sort (expert, position) ordered pairs to perform an argsort:
-    kv_pairs = ((y << 16) | tl.arange(0, BLOCK_N)[None, :]).to(tl.uint32)
-    sorted_kv_pairs = tl.sort(kv_pairs, 1)
+    # stable-sort by expert ID:
+    kv_pairs = ((expert << 16) | local_offs).to(tl.uint32)
+    kv_pairs = tl.sort(kv_pairs, 0)
+    expert = kv_pairs >> 16
+    offs = pid_m * BLOCK_M * N_EXPTS_ACT + (kv_pairs & 0xffff)
+    mask = expert != 0xffff
+    gate_scal = tl.load(ExptScal + offs, mask=mask)

     # compute run lengths in expert-sorted order:
-    x = (sorted_kv_pairs & 0xffff0000 | 0x00000001)
-    expts_and_inclusive_run_lengths = tl.associative_scan(x, 1, _keyed_add)
+    x = (kv_pairs & 0xffff0000 | 0x00000001)
+    expts_and_inclusive_run_lengths = tl.associative_scan(x, 0, _keyed_add)
     exclusive_run_lengths = (expts_and_inclusive_run_lengths - 1) & 0xffff

-    # undo permutation by doing another sort
-    # TODO rewrite this when tl.scatter becomes available
-    kv_pairs = ((sorted_kv_pairs << 16) | exclusive_run_lengths).to(tl.uint32)
-    unsorted_run_lengths = tl.sort(kv_pairs) & 0xffff
-
-    res = tl.reshape(unsorted_run_lengths, x.shape)
-    return res
-
+    gates = tl.load(PartialOffs + pid_m * stride_pm + expert, mask=(expert != 0xffff))
+    gates += exclusive_run_lengths

-@triton.jit
-def _routing_compute_indx(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm, n_gates,
-                          BLOCK_M: tl.constexpr, N_EXPTS_ACT: tl.constexpr):
-    pid_m = tl.program_id(0)
-    offs = pid_m * BLOCK_M * N_EXPTS_ACT + tl.arange(0, N_EXPTS_ACT * BLOCK_M)
-    mask = offs < n_gates
-    indx = tl.load(ExptIndx + offs, mask=mask)
-    mask = mask & (indx != -1)
-    gates = tl.load(PartialOffs + pid_m * stride_pm + indx, mask=mask)
-    gates += tl.reshape(_count_previous(indx), [BLOCK_M * N_EXPTS_ACT])
-    gate_scal = tl.load(ExptScal + offs, mask=mask)
     tl.store(ScatterIndx + offs, gates, mask=mask)
     tl.store(GatherIndx + gates, offs, mask=mask)
     tl.store(GateScal + gates, gate_scal, mask=mask)
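The rewritten _routing_compute_indx above folds the old _count_previous helper into a packed stable sort: each (expert, position) pair becomes a single uint32 key (expert << 16) | position, one tl.sort orders the keys by expert while preserving the original order of equal experts, and the keyed scan then gives each gate its rank within its expert. A rough pure-Python model of that packing-and-ranking idea (illustrative only, not Triton code):

expert = [2, 0, 2, 1, 0, 2]                       # hypothetical expert id per gate (fits in 16 bits)
kv = sorted((e << 16) | i for i, e in enumerate(expert))

rank_within_expert = {}
counts = {}
for pair in kv:
    e, i = pair >> 16, pair & 0xffff
    rank_within_expert[i] = counts.get(e, 0)      # exclusive run length, as in the kernel
    counts[e] = counts.get(e, 0) + 1

print(rank_within_expert)                         # {1: 0, 4: 1, 3: 0, 0: 0, 2: 1, 5: 2}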
@@ -117,15 +98,16 @@ def _routing_clear_bitmatrix(Bitmatrix, stride_bm, shape_bn, cutoff, BLOCK_N: tl


 @triton.jit
-def _routing_memset_indx(Indx0, Indx1, size, sentinel, BLOCK: tl.constexpr):
+def _routing_memset_indx(Indx, size, sentinel, BLOCK: tl.constexpr, ExpertHist, FinalExpertOffs, hist_size,
+                         BLOCK_N: tl.constexpr):
     pid = tl.program_id(0)
-    buf = tl.program_id(1)
-    offs = pid * BLOCK + tl.arange(0, BLOCK)
-    mask = offs < size
-    if buf == 0:
-        tl.store(Indx0 + offs, sentinel, mask=mask)
-    if buf == 1:
-        tl.store(Indx1 + offs, sentinel, mask=mask)
+
+    if pid == 0:
+        _routing_compute_expt_offs(ExpertHist, FinalExpertOffs, hist_size, BLOCK_N)
+    else:
+        offs = (pid - 1) * BLOCK + tl.arange(0, BLOCK)
+        mask = offs < size
+        tl.store(Indx + offs, sentinel, mask=mask)


 @dataclass
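With this hunk a single launch covers two jobs: program 0 computes the cumulative expert offsets while every other program clears one block of the index buffer, removing a separate host-side launch from the routing path. The host sizes the grid accordingly (see the next hunk); a tiny sketch of that arithmetic with made-up numbers:

def cdiv(a, b):
    return (a + b - 1) // b

n_gates, MEMSET_BLOCK = 6144, 1024                # hypothetical values, for illustration only
grid = (cdiv(n_gates * 2, MEMSET_BLOCK) + 1,)     # +1: program 0 handles the expert offsets
print(grid)                                       # (13,)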
@@ -204,22 +186,15 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
     # perform compaction to update expt_scal / expt_indx
     hist, partial_hist = sum(bitmatrix, partials_block_size=HIST_BLOCK_M, dim=0)
     # scratchpad
-    expt_offs = torch.empty(n_expts_tot + 1, dtype=torch.int32, device=device)
+    expt_offs = torch.empty(n_expts_tot, dtype=torch.int32, device=device)
     indx_offs = torch.empty((cdiv(n_tokens, HIST_BLOCK_M), n_expts_tot), dtype=torch.int32, device=device)
+    combined_indx = torch.empty(n_gates * 2, dtype=torch.int32, device=device)
     # output
-    topk_indx = torch.empty(n_gates, dtype=torch.int32, device=device)
-    gate_indx = torch.empty(n_gates, dtype=torch.int32, device=device)
+    topk_indx = combined_indx[:n_gates]
+    gate_indx = combined_indx[n_gates:]
     gate_scal = torch.empty(n_gates, dtype=logits.dtype, device=device)
-    _routing_memset_indx[(cdiv(n_gates, MEMSET_BLOCK), 2)](
-        topk_indx,
-        gate_indx,
-        n_gates,
-        -1,
-        BLOCK=MEMSET_BLOCK,
-    )
-    _routing_compute_expt_offs[(1, )](
-        hist, expt_offs, hist.shape[0], BLOCK_N=512  # tunable parameters
-    )
+    _routing_memset_indx[(cdiv(n_gates * 2, MEMSET_BLOCK) + 1, )](combined_indx, n_gates * 2, -1, MEMSET_BLOCK, hist,
+                                                                  expt_offs, hist.shape[0], BLOCK_N=512)
     _routing_compute_indx_offs[(n_expts_tot, )](
         expt_offs, partial_hist,  # inputs
         indx_offs, partial_hist.shape[0], partial_hist.stride(0),  # outputs
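On the host side, topk_indx and gate_indx become views into a single combined_indx allocation, so one contiguous memset initializes both buffers and the fused kernel above needs only one pointer. A minimal PyTorch sketch of the slicing (illustrative; sizes are arbitrary and no GPU is assumed):

import torch

n_gates = 8
combined_indx = torch.empty(n_gates * 2, dtype=torch.int32)
topk_indx = combined_indx[:n_gates]               # view: shares storage with combined_indx
gate_indx = combined_indx[n_gates:]               # view: shares storage with combined_indx
combined_indx.fill_(-1)                           # one fill initializes both index buffers
assert topk_indx.data_ptr() == combined_indx.data_ptr()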

python/test/unit/language/test_matmul.py

Lines changed: 4 additions & 3 deletions
@@ -399,9 +399,10 @@ def test_mxfp(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, nonKDim, NUM_WARPS
     if not is_cuda():
         return

-    # Pipelining of dot_scaled requires tmem_copy to be used, which in turn
-    # requires the scales to be in the blocked layout in global memory.
-    assert out.asm["ttgir"].count("ttng.tc_gen5_mma") == 1
+    if is_cuda():
+        # Pipelining of dot_scaled requires tmem_copy to be used, which in turn
+        # requires the scales to be in the blocked layout in global memory.
+        assert out.asm["ttgir"].count("ttng.tc_gen5_mma") == 1


 def _knob_promote_lhs_to_tmem(monkeypatch):

test/Conversion/amd/fp_to_fp.mlir

Lines changed: 19 additions & 4 deletions
@@ -1,5 +1,5 @@
-// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck --check-prefix=GFX942 %s
-// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck --check-prefix=GFX950 %s
+// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck --check-prefixes=COMMON,GFX942 %s
+// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck --check-prefixes=COMMON,GFX950 %s

 // CHECK-LABEL: f16_to_f32
 #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
@@ -32,15 +32,30 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     // GFX942-COUNT-8: llvm.fptrunc %{{.+}} : f32 to f16
     // GFX950-COUNT-4: llvm.fptrunc %{{.+}} : vector<2xf32> to vector<2xf16>
     %0 = tt.fp_to_fp %arg0, rounding = rtne : tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
-    // GFX942-COUNT-4: rocdl.cvt.pkrtz
-    // GFX950-COUNT-4: rocdl.cvt.pkrtz
+    // COMMON-COUNT-4: rocdl.cvt.pkrtz
     %1 = tt.fp_to_fp %arg0, rounding = rtz : tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
     tt.return
   }
 }

 // -----

+// CHECK-LABEL: f32_to_f16_single_value
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @f32_to_f16_single_value(%arg0: tensor<1x128xf32, #blocked>) {
+    // COMMON: llvm.fptrunc %{{.+}} : f32 to f16
+    // COMMON-NOT: llvm.fptrunc
+    %0 = tt.fp_to_fp %arg0, rounding = rtne : tensor<1x128xf32, #blocked> -> tensor<1x128xf16, #blocked>
+    // COMMON: rocdl.cvt.pkrtz
+    // COMMON-NOT: rocdl.cvt.pkrtz
+    %1 = tt.fp_to_fp %arg0, rounding = rtz : tensor<1x128xf32, #blocked> -> tensor<1x128xf16, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
 // CHECK-LABEL: downcast_to_f8
 #blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {

test/TritonGPU/amd/mfma-double-rate.mlir

Lines changed: 5 additions & 6 deletions
@@ -61,8 +61,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr

 // -----

-// When kWidth is set to 4, generate single rated mfma instructions.
-// In a future PR, such cases will still generate double rated mfma instructions with kWidth = 4.
+// When kWidth is set to 4, still generate double rated mfma instructions.

 // CHECK-LABEL:mfma_16x16x32_f16

@@ -74,7 +73,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
       %q: tensor<128x128xf16, #dotOp0>,
       %k: tensor<128x128xf16, #dotOp1>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
-    // CHECK: rocdl.mfma.f32.16x16x16f16 {{.*}} : (vector<4xf16>, vector<4xf16>
+    // CHECK: rocdl.mfma.f32.16x16x32.f16 {{.*}} : (vector<8xf16>, vector<8xf16>
     %qk = tt.dot %q, %k, %cst : tensor<128x128xf16, #dotOp0> * tensor<128x128xf16, #dotOp1> -> tensor<128x128xf32, #mma>
     tt.return
   }

@@ -92,7 +91,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
       %q: tensor<128x128xbf16, #dotOp0>,
      %k: tensor<128x128xbf16, #dotOp1>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
-    // CHECK: rocdl.mfma.f32.16x16x16bf16.1k {{.*}} : (vector<4xi16>, vector<4xi16>
+    // CHECK: rocdl.mfma.f32.16x16x32.bf16 {{.*}} : (vector<8xbf16>, vector<8xbf16>
     %qk = tt.dot %q, %k, %cst : tensor<128x128xbf16, #dotOp0> * tensor<128x128xbf16, #dotOp1> -> tensor<128x128xf32, #mma>
     tt.return
   }

@@ -110,7 +109,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
       %q: tensor<128x128xf16, #dotOp0>,
       %k: tensor<128x128xf16, #dotOp1>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
-    // CHECK: rocdl.mfma.f32.32x32x8f16 {{.*}} : (vector<4xf16>, vector<4xf16>
+    // CHECK: rocdl.mfma.f32.32x32x16.f16 {{.*}} : (vector<8xf16>, vector<8xf16>
     %qk = tt.dot %q, %k, %cst : tensor<128x128xf16, #dotOp0> * tensor<128x128xf16, #dotOp1> -> tensor<128x128xf32, #mma>
     tt.return
   }

@@ -128,7 +127,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
       %q: tensor<128x128xbf16, #dotOp0>,
       %k: tensor<128x128xbf16, #dotOp1>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
-    // CHECK: rocdl.mfma.f32.32x32x8bf16.1k {{.*}} : (vector<4xi16>, vector<4xi16>
+    // CHECK: rocdl.mfma.f32.32x32x16.bf16 {{.*}} : (vector<8xbf16>, vector<8xbf16>
     %qk = tt.dot %q, %k, %cst : tensor<128x128xbf16, #dotOp0> * tensor<128x128xbf16, #dotOp1> -> tensor<128x128xf32, #mma>
     tt.return
   }
