
Commit 402d57c

Merge commit '607c50cc9fdd2541db88b5a8681164f081dd71ad'
2 parents 0c81300 + 607c50c commit 402d57c

File tree: 10 files changed (+102, −55 lines)


Makefile

Lines changed: 2 additions & 2 deletions
@@ -36,8 +36,8 @@ test-unit: all
 	$(PYTEST) -s -n 8 python/test/unit/test_debug.py --forked
 	$(PYTEST) -s -n 8 python/triton_kernels/tests/
 	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
-	# Run cuda/test_flashattention.py separately to avoid out of gpu memory
-	$(PYTEST) -s python/test/unit/cuda/test_flashattention.py
+	# Run attention separately to avoid out of gpu memory
+	TRITON_PRINT_AUTOTUNING=1 $(PYTEST) -vs python/tutorials/06-fused-attention.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 		$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
4343

python/test/unit/language/test_standard.py

Lines changed: 15 additions & 13 deletions
@@ -65,24 +65,26 @@ def sort_kernel(X, stride_xm, Z, stride_zm, M: tl.constexpr, N: tl.constexpr, k:
 
 
 @pytest.mark.interpreter
-@pytest.mark.parametrize("M, N", [[1, 512], [8, 64], [256, 16], [512, 8]])
+@pytest.mark.parametrize("M, N, K", [[1, 16, 64], [8, 2, 256], [32, 1, 2], [128, 8, 1]])
 @pytest.mark.parametrize("dtype_str", ['int32', 'float16', 'float32', 'bfloat16'])
-def test_flip(M, N, dtype_str, device):
+@pytest.mark.parametrize("dim", [0, 1, 2, -2])
+def test_flip(M, N, K, dtype_str, dim, device):
 
     @triton.jit
-    def flip_kernel(X, Z, N: tl.constexpr, M: tl.constexpr):
-        offx = tl.arange(0, M)
-        offy = tl.arange(0, N) * M
-        off2d = offx[None, :] + offy[:, None]
-        x = tl.load(X + off2d)
-        x = tl.flip(x)
-        tl.store(Z + off2d, x)
-
-    x = numpy_random((N, M), dtype_str=dtype_str)
+    def flip_kernel(X, Z, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr, dim: tl.constexpr):
+        offx = tl.arange(0, M) * N * K
+        offy = tl.arange(0, N) * K
+        offz = tl.arange(0, K)
+        off3d = offx[:, None, None] + offy[None, :, None] + offz[None, None, :]
+        x = tl.load(X + off3d)
+        x = tl.flip(x, dim)
+        tl.store(Z + off3d, x)
+
+    x = numpy_random((M, N, K), dtype_str=dtype_str)
     x = torch.from_numpy(x).to(device)
-    y = torch.flip(x, (1, ))
+    y = torch.flip(x, (dim, ))
     z = torch.empty_like(x, device=device)
-    flip_kernel[(1, )](x, z, N, M, num_warps=8)
+    flip_kernel[(1, )](x, z, M, N, K, dim, num_warps=8)
     assert (y == z).all(), (y, z)
 

python/test/unit/language/test_tuple.py

Lines changed: 35 additions & 0 deletions
@@ -162,3 +162,38 @@ def mul(x, a):
     ty = Tensor(y, y.shape, y.stride())
     _namedtuple_kernel[(1, )](function, tx, ty, 64, 64)
     assert torch.allclose(y, x[:16, :16] * a)
+
+
+@pytest.mark.interpreter
+def test_eq(device):
+
+    @triton.jit
+    def fn(ret_ptrs):
+        tl.store(ret_ptrs + 0, (1, 2) == (1, 2))
+        tl.store(ret_ptrs + 1, (1, 2) == (1, 1))
+        tl.store(ret_ptrs + 2, tl.tuple((1, 2)) == (1, 2))
+        tl.store(ret_ptrs + 3, tl.tuple((1, 2)) == (1, 3))
+
+    rets = torch.zeros((4, ), dtype=torch.int32, device=device)
+    fn[(1, )](rets)
+    assert rets[0].item() == 1
+    assert rets[1].item() == 0
+    assert rets[2].item() == 1
+    assert rets[3].item() == 0
+
+
+@pytest.mark.interpreter
+def test_add(device):
+
+    @triton.jit
+    def fn(ret_ptrs):
+        tuple0 = ((0, 1)) + (2, 3)
+        for i in tl.static_range(4):
+            tl.store(ret_ptrs + i, tuple0[i])
+        tuple1 = tl.tuple((4, 5)) + (6, 7)
+        for i in tl.static_range(4):
+            tl.store(ret_ptrs + 4 + i, tuple1[i])
+
+    rets = torch.zeros((8, ), dtype=torch.int32, device=device)
+    fn[(1, )](rets)
+    torch.testing.assert_close(rets.cpu(), torch.arange(8, dtype=torch.int32))

python/triton/language/core.py

Lines changed: 10 additions & 9 deletions
@@ -306,6 +306,13 @@ def _unwrap_if_constexpr(o):
     return o.value if isinstance(o, constexpr) else o
 
 
+def _normalize_tuple(t):
+    normalized_tuple = _unwrap_if_constexpr(t)
+    if isinstance(normalized_tuple, (list, builtins.tuple)):
+        normalized_tuple = tuple(normalized_tuple)
+    return normalized_tuple
+
+
 def check_bit_width(value, shift_value):
     if isinstance(value, tensor) and isinstance(shift_value, constexpr):
         bitwidth = value.type.scalar.primitive_bitwidth
@@ -1069,7 +1076,6 @@ def __not__(self, _builder=None):
 
     @builtin
     def __getitem__(self, slices, _builder=None):
-        import builtins
         if isinstance(slices, (builtins.slice, slice, constexpr)) or slices is None:
             slices = [slices]
         if isinstance(slices, tuple):
@@ -1237,7 +1243,7 @@ def flip(self, dim=None) -> tensor:
 
 class tuple(base_value):
 
-    def __init__(self, args: list, type: tuple_type = None):
+    def __init__(self, args: Sequence, type: tuple_type = None):
         self.values = [i for i in args]
 
         def get_type(x):
@@ -1255,7 +1261,6 @@ def __getitem__(self, idx: constexpr):
         if isinstance(idx, constexpr):
            return self.values[idx]
        else:
-            import builtins
            assert isinstance(idx, (slice, builtins.slice))
            return tuple(self.values[idx.start:idx.stop:idx.step])
 
@@ -1270,8 +1275,7 @@ def __setitem__(self, idx: constexpr, value):
         self.values[idx] = value
 
     def __add__(self, other):
-        if isinstance(other, list):
-            other = tuple(other)
+        other = _normalize_tuple(other)
         return tuple(self.values + other.values)
         # return tuple(a + b for a, b in zip(self.values, other.values))
 
@@ -1280,13 +1284,10 @@ def __mul__(self, other):
         return tuple(self.values * other.value)
 
     def __eq__(self, other):
-        import builtins
-        if isinstance(other, (list, builtins.tuple)):
-            other = tuple(other)
+        other = _normalize_tuple(other)
         return constexpr(self.values == other.values)
 
     def __hash__(self):
-        import builtins
         return hash(builtins.tuple(self.values))
 
     def __str__(self):
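
Note: as a standalone illustration of the pattern `_normalize_tuple` factors out of `__add__` and `__eq__` (unwrap a constexpr-style wrapper, then coerce lists and built-in tuples into the wrapper's tuple type so plain Python sequences can appear on the right-hand side), here is a minimal plain-Python sketch. `Constexpr` and `MyTuple` are illustrative stand-ins, not the Triton classes:

import operator

class Constexpr:
    # stand-in for triton.language.core.constexpr: a value wrapped for compile time
    def __init__(self, value):
        self.value = value

class MyTuple:
    # stand-in for the tl.tuple wrapper around a list of values
    def __init__(self, args):
        self.values = list(args)

    @staticmethod
    def _normalize(other):
        # unwrap the constexpr-style wrapper, then coerce list/tuple to MyTuple
        other = other.value if isinstance(other, Constexpr) else other
        if isinstance(other, (list, tuple)):
            other = MyTuple(other)
        return other

    def __add__(self, other):
        return MyTuple(self.values + MyTuple._normalize(other).values)

    def __eq__(self, other):
        return operator.eq(self.values, MyTuple._normalize(other).values)

assert (MyTuple((1, 2)) + [3, 4]) == (1, 2, 3, 4)
assert MyTuple((1, 2)) == Constexpr((1, 2))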

python/triton/language/standard.py

Lines changed: 11 additions & 11 deletions
@@ -475,7 +475,8 @@ def _get_flip_dim(dim, shape):
     shape = core._unwrap_if_constexpr(shape)
     if dim is None:
         dim = len(shape) - 1
-    assert dim == len(shape) - 1, "Currently only support flipping the last dimension"
+    if dim < 0:  # flip doesn't work if dim < 0 because the xor-swap for loop will start/end at the wrong index
+        dim += len(shape)
     return core.constexpr(dim)
 
 
@@ -487,20 +488,19 @@ def flip(x, dim=None):
 
     :param x: the first input tensor
     :type x: Block
-    :param dim: the dimension to flip along (currently only final dimension supported)
+    :param dim: the dimension to flip along
     :type dim: int
     """
-    core.static_assert(_is_power_of_two(x.shape[_get_flip_dim(dim, x.shape)]))
-    core.static_assert(_is_power_of_two(x.numel))
-    # reshape the tensor to have all dimensions be 2.
-    # TODO: We shouldn't have to change the dimensions not sorted.
-    steps: core.constexpr = _log2(x.numel)
-    start: core.constexpr = _log2(x.numel) - _log2(x.shape[_get_flip_dim(dim, x.shape)])
+    core.static_assert(-len(x.shape) <= dim and dim < len(x.shape))
+    _dim: core.constexpr = _get_flip_dim(dim, x.shape)
+    core.static_assert(_is_power_of_two(x.shape[_dim]))
+    steps: core.constexpr = _log2(x.shape[_dim])
 
+    # reshape the swap dimension to (2, 2, ..., 2)
     idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
-    y = core.reshape(x.to(idtype, bitcast=True), [2] * steps)
-    for i in core.static_range(start, steps):
-        y = y ^ xor_sum(y, i, True)
+    y = core.reshape(x.to(idtype, bitcast=True), x.shape[:_dim] + [2] * steps + x.shape[_dim + 1:])
+    for i in core.static_range(steps):
+        y = y ^ xor_sum(y, _dim + i, True)
     x = core.reshape(y, x.shape).to(x.dtype, bitcast=True)
     return x
 
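
Note: the generalized flip still relies on the xor-swap reduction: reshaping the flipped dimension of length 2**steps into `steps` axes of size 2 and xor-ing every element with the xor-reduction along each of those axes complements each bit of the index along that dimension, which reverses it. A minimal NumPy sketch of the idea (flipping only the last dimension, integer data, power-of-two length assumed; an illustration, not the Triton code):

import numpy as np

def flip_pow2(x: np.ndarray) -> np.ndarray:
    # Flip the last dimension of an integer array whose length is a power of two,
    # using the same xor-swap reduction as triton.language.standard.flip.
    n = x.shape[-1]
    steps = n.bit_length() - 1
    assert 1 << steps == n, "flipped dimension must have power-of-two length"
    # view the last dimension as `steps` axes of size 2
    y = x.reshape(x.shape[:-1] + (2,) * steps)
    for axis in range(x.ndim - 1, x.ndim - 1 + steps):
        # xor with the reduction over a size-2 axis swaps the two entries:
        # a ^ (a ^ b) == b and b ^ (a ^ b) == a
        y = y ^ np.bitwise_xor.reduce(y, axis=axis, keepdims=True)
    return y.reshape(x.shape)

x = np.arange(16, dtype=np.int32).reshape(2, 8)
assert (flip_pow2(x) == x[:, ::-1]).all()

Floating-point tensors go through the same path after a bitcast to an integer type of equal width, which is what the idtype/bitcast lines in the diff above do.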

python/triton_kernels/bench/bench_mlp.py

Lines changed: 12 additions & 9 deletions
@@ -38,7 +38,7 @@ def _query_gpu_specs():
 
 gpu_specs = {
     "NVIDIA H100 80GB HBM3": {"MAX_TFLOPS8": 1979, "MAX_TFLOPS16": 989, "MAX_TBPS": 3.35},
-    "HGX GB200": {"MAX_TFLOPS8": 4500, "MAX_TFLOPS16": 2250, "MAX_TBPS": 8.0},
+    "NVIDIA GB200": {"MAX_TFLOPS8": 4500, "MAX_TFLOPS16": 2250, "MAX_TBPS": 8.0},
     "AMD Instinct MI300X": {"MAX_TFLOPS8": 2615, "MAX_TFLOPS16": 1307, "MAX_TBPS": 5.3},
     "AMD Instinct MI325X": {"MAX_TFLOPS8": 2615, "MAX_TFLOPS16": 1307, "MAX_TBPS": 6.0},
 }
@@ -175,7 +175,8 @@ def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_
     batches = list(chain(*[range(*r) for r in batch_ranges]))
     # collect performance data
     perfs = []
-    print(f"Benchmarking {name} ({x_dtype}x{w_dtype}, TP={TP}, EP={EP})...")
+    bench_case = f"{name} ({x_dtype}x{w_dtype}, TP={TP}, EP={EP})"
+    print(f"Benchmarking {bench_case}...")
     print("===============================================================")
     for batch in batches:
         perfs += [bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP, EP, name)]
@@ -186,7 +187,7 @@ def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_
     fig, ax = plt.subplots(figsize=(7, 5), dpi=120)
     ax.set_xlabel("batch size (toks/expt)")
     ax.set_ylabel("performance [TFLOP/s]")
-    ax.set_title("roofline")
+    ax.set_title(f"{bench_case} roofline")
     # add a tiny margin so points are not flush with the frame
     xs = [batch * n_expts_act / n_expts_tot for batch in batches]
     perf = [p.tflops for p in perfs]
@@ -200,7 +201,8 @@ def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_
     opints = [p.opint for p in perfs]
     knee = bisect_left(opints, max_tflops / max_tbps) - 1
     x_bw, x_comp = xs[:knee], xs[knee:]
-    y_bw = [op * max_tbps for op in opints[:knee]]
+    x_bw = [x_bw[0], x_comp[0]]
+    y_bw = [opints[0] * max_tbps, max_tflops]
     y_comp = [max_tflops] * len(x_comp)
     ax.plot(x_bw, y_bw, "--", label=f"BW-bound ({max_tbps:.0f} TB/s)")
     ax.plot(x_comp, y_comp, "--", label=f"Compute-bound ({max_tflops:.0f} TFLOP/s)")
@@ -217,10 +219,11 @@ def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_
     has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or get_cdna_version() == 4
     if SPECS is None:
         print("Current GPU has no specs provided, utilization is N/A")
-    batch_ranges = [(1024, 32768, 1024)]
+    batch_ranges_dense = [(1024, 32768, 1024)]
+    batch_ranges_moe = [(128, 512, 32), (512, 32000, 128)]
     dense_dtypes = ["fp8", "fp8"]
     quantized_dtypes = ["fp8", "mx4"] if has_native_mx4 else ["bf16", "mx4"]
-    roofline_mlp(batch_ranges, 8192, 8192, 1, 1, *dense_dtypes, TP=1, EP=1, name="dense")
-    roofline_mlp(batch_ranges, 8192, 8192, 1, 1, *quantized_dtypes, TP=1, EP=1, name="dense")
-    roofline_mlp(batch_ranges, 5120, 8192, 128, 4, *dense_dtypes, TP=1, EP=1, name="llama4-maverick")
-    roofline_mlp(batch_ranges, 5120, 8192, 128, 4, *quantized_dtypes, TP=1, EP=1, name="llama4-maverick")
+    roofline_mlp(batch_ranges_dense, 8192, 8192, 1, 1, *dense_dtypes, TP=1, EP=1, name="dense")
+    roofline_mlp(batch_ranges_dense, 8192, 8192, 1, 1, *quantized_dtypes, TP=1, EP=1, name="dense")
+    roofline_mlp(batch_ranges_moe, 5120, 8192, 128, 4, *dense_dtypes, TP=1, EP=1, name="llama4-maverick")
+    roofline_mlp(batch_ranges_moe, 5120, 8192, 128, 4, *quantized_dtypes, TP=1, EP=1, name="llama4-maverick")
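
Note: for reference, the roofline logic the plot encodes: a data point is bandwidth-bound while its operational intensity (FLOP/byte) is below max_tflops / max_tbps and compute-bound above it, and the attainable throughput is min(peak compute, intensity x peak bandwidth). A small sketch with made-up intensities (the spec numbers are the H100 entries from gpu_specs above; illustrative only, not a measurement):

from bisect import bisect_left

max_tflops, max_tbps = 1979.0, 3.35        # H100 fp8 peak compute / peak bandwidth
opints = [100, 300, 500, 700, 900]         # hypothetical operational intensities, sorted
knee = bisect_left(opints, max_tflops / max_tbps) - 1        # last bandwidth-bound point
attainable = [min(max_tflops, oi * max_tbps) for oi in opints]
print(knee, [round(t) for t in attainable])  # 2 [335, 1005, 1675, 1979, 1979]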

python/triton_kernels/triton_kernels/routing.py

Lines changed: 0 additions & 1 deletion
@@ -62,7 +62,6 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
     HIST_BLOCK_M = 64
     INDX_OFFS_BLOCK_M = 512
     MEMSET_BLOCK = 1024
-    assert logits.dtype.itemsize == 2
     n_tokens, n_expts_tot = logits.shape
     n_gates = n_tokens * n_expts_act
     device = logits.device

python/triton_kernels/triton_kernels/topk.py

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@ def topk(x, k, dim=1, return_bitmatrix=True):
     cdiv = lambda a, b: (a + b - 1) // b
     BLOCK_M = 8
     BLOCK_N = 128
-    assert x.dtype.itemsize == 2
     assert x.ndim == 2
     assert x.shape[-1] < 32768
     assert dim == 1

python/triton_kernels/triton_kernels/topk_details/_topk.py

Lines changed: 15 additions & 8 deletions
@@ -5,6 +5,10 @@
 @triton.jit
 def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr,
                    BLOCK_N: tl.constexpr):
+    x_nbits: tl.constexpr = X.dtype.element_ty.primitive_bitwidth
+    x_utype: tl.constexpr = tl.dtype(f"uint{x_nbits}")
+    x_ultype: tl.constexpr = tl.dtype(f"uint{2*x_nbits}")
+    x_dbtype: tl.constexpr = tl.dtype(f"fp{2*x_nbits}")
 
     # subtract 1 from loop iterations because we peel the first (masked) iteration:
     loop_iterations: tl.constexpr = N_EXPTS_PAD // BLOCK_N - 1
@@ -15,8 +19,8 @@ def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.co
     # first iteration:
     X_ptrs = X + offs_m[:, None] * stride_xm + offs_x_n[None, :]
     x = tl.load(X_ptrs, mask=(mask_m & mask_n), other=float("-inf"))
-    x = (x.to(tl.uint16, bitcast=True).to(tl.int32) << 16) | offs_x_n[None, :]
-    x = x.to(tl.float32, bitcast=True)
+    x = (x.to(x_utype, bitcast=True).to(x_ultype) << x_nbits) | offs_x_n[None, :]
+    x = x.to(x_dbtype, bitcast=True)
 
     acc = tl.topk(x, N_EXPTS_ACT, dim=1)
 
@@ -26,8 +30,8 @@ def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.co
         X_ptrs -= BLOCK_N
         offs_x_n -= BLOCK_N
         x = tl.load(X_ptrs, mask=mask_m, other=float("-inf"))
-        x = (x.to(tl.uint16, bitcast=True).to(tl.int32) << 16) | offs_x_n[None, :]
-        x = x.to(tl.float32, bitcast=True)
+        x = (x.to(x_utype, bitcast=True).to(x_ultype) << x_nbits) | offs_x_n[None, :]
+        x = x.to(x_dbtype, bitcast=True)
         acc = tl.maximum(acc, tl.topk(x, N_EXPTS_ACT, dim=1))
 
     return acc
@@ -43,18 +47,21 @@ def _topk(X, stride_xm, # inputs
     tl.static_assert(BLOCK_N % 32 == 0)
     tl.static_assert(N_EXPTS_PAD % BLOCK_N == 0)
     x_dtype: tl.constexpr = X.dtype.element_ty
+    x_nbits: tl.constexpr = X.dtype.element_ty.primitive_bitwidth
+    x_utype: tl.constexpr = tl.dtype(f"uint{x_nbits}")
+    x_ultype: tl.constexpr = tl.dtype(f"uint{2*x_nbits}")
 
     # load logits
     offs_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
     mask_m = offs_m[:, None] < n_rows
     y = streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD, N_EXPTS_ACT, BLOCK_N)
-    y = y.to(tl.uint32, bitcast=True)
+    y = y.to(x_ultype, bitcast=True)
 
     # sort result in direction of ascending expert index
-    y = (y << 16) | (y >> 16)
+    y = (y << x_nbits) | (y >> x_nbits)
     y = tl.sort(y, dim=1)
-    y_indices = y >> 16
-    y_values = (y & 0x0000FFFF).to(tl.uint16).to(x_dtype, bitcast=True)
+    y_indices = y >> x_nbits
+    y_values = (y & ((1 << x_nbits) - 1)).to(x_utype).to(x_dtype, bitcast=True)
     y_values = tl.softmax(y_values.to(tl.float32), dim=1, keep_dims=True).to(x_dtype)
 
     # write back
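
Note: the hunks above generalize a pack/sort/unpack trick from hard-coded 16-bit logits to any element width: each value is bitcast to an unsigned integer of the same width, shifted into the high half of a double-width word, and OR-ed with its column index, so selecting the top-k packed keys orders by value while carrying the index along. A NumPy sketch of the packing step, assuming non-negative values so the unsigned bit pattern sorts in the same order as the floats (illustrative only, not the kernel):

import numpy as np

vals = np.array([0.5, 2.0, 1.25, 0.75], dtype=np.float16)      # hypothetical logits
idx = np.arange(vals.size, dtype=np.uint32)
packed = (vals.view(np.uint16).astype(np.uint32) << 16) | idx  # value in high bits, index in low bits
order = np.argsort(packed)[::-1][:2]                           # top-2 by packed key
top_idx = packed[order] & 0xFFFF                               # -> [1, 2]
top_vals = (packed[order] >> 16).astype(np.uint16).view(np.float16)  # -> [2.0, 1.25]

The kernel's later `(y << x_nbits) | (y >> x_nbits)` swap then moves the index into the high half so that `tl.sort` reorders the selected experts by ascending index before unpacking.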

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 2 additions & 1 deletion
@@ -428,8 +428,9 @@ struct FpToFpOpConversion
          Fp16_to_Fp8E5M2_RTNE(computeCapability >= 89)},
         {{F16TyID, F8E5M2TyID, RoundingMode::RTZ}, Fp16_to_Fp8E5M2_RTZ},
         // F8 -> BF16
+        // mul{.rnd}.bf16 and mul{.rnd}.bf16x2 requires sm_90 or higher.
         {{F8E5M2TyID, BF16TyID, undefRounding},
-         Fp8E5M2_to_Bf16(computeCapability >= 89)},
+         Fp8E5M2_to_Bf16(computeCapability >= 90)},
         {{F8E4M3TyID, BF16TyID, undefRounding},
          Fp8E4M3Nv_to_Bf16(computeCapability >= 89)},
         // BF16 -> F8

0 commit comments
