
Commit fc024d8

Merge OpenAI Triton commit f05cdc4 (#4134)
This PR changes the Triton base from 553d01d to f05cdc4 (May 5). Pass rate: 94.57%.
2 parents 5271aa4 + 71c5fd4 commit fc024d8

16 files changed: +142 additions, −78 deletions

.github/workflows/documentation.yml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ permissions: read-all
 
 jobs:
   Build-Documentation:
-    runs-on: [a100-runner-set]
+    runs-on: [nvidia-a100]
     timeout-minutes: 30
 
     steps:

bench/bench/bench_mlp.py

Lines changed: 94 additions & 35 deletions
@@ -1,4 +1,5 @@
 from pathlib import Path
+import matplotlib.pyplot as plt
 import json
 import triton.profiler as proton
 import torch
@@ -8,6 +9,7 @@
 from triton_bench.numerics import InFlexData
 from triton_bench.routing import routing
 from triton_bench.target_info import is_hip, get_cdna_version
+from dataclasses import dataclass
 
 if torch.cuda.is_available() and not is_hip():
     from triton._C.libtriton import nvidia
@@ -66,9 +68,38 @@ def quantize(w, dtype, dev, **opt):
                     actual_weight_scale_shape=weight_scale_shape)
 
 
-def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
-              # tensor / expert parallelism
-              TP=1, EP=1, name=""):
+@dataclass
+class PerfData:
+    time: float
+    flops: float
+    bytes: float
+
+    @property
+    def tflops(self):
+        return self.flops / self.time * 1e-3
+
+    @property
+    def tbps(self):
+        return self.bytes / self.time * 1e-3
+
+    @property
+    def opint(self):
+        # operational intensity
+        assert self.bytes > 0
+        return self.flops / self.bytes
+
+    @property
+    def util(self) -> float:
+        if SPECS is None:
+            return 0.0
+
+        peak_flops = max(SPECS["MAX_TFLOPS8"], SPECS.get("MAX_TFLOPS16", 0))
+        min_t_flop = self.flops / peak_flops * 1e-3  # ns → µs
+        min_t_bw = self.bytes / SPECS["MAX_TBPS"] * 1e-3
+        return max(min_t_flop, min_t_bw) / self.time
+
+
+def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP, EP, name):
     assert n_expts_tot % EP == 0
     assert dim2 % TP == 0
     dev = "cuda"
@@ -96,7 +127,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
     pc2 = PrecisionConfig(mx_ctx=w2_mx, flex_ctx=FlexCtx(rhs_data=w2_flex))
 
     # -- benchmark --
-    fpath = Path(f"logs/{name}/{batch}-{dim1}-{dim2}-{n_expts_tot}-{n_expts_act}-{x_dtype}-{w_dtype}.hatchet")
+    fpath = Path(f"logs/{name}/{x_dtype}-{w_dtype}-TP{TP}-EP{EP}/profiles/batch-{batch}.hatchet")
     fpath.parent.mkdir(parents=True, exist_ok=True)
     x_dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}[x_dtype]
     # special treatment of fp8_e4m3 on AMD CDNA3 because it uses fp8_e4m3fnuz
@@ -115,7 +146,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
     else:
         rdata, gather_indx, scatter_indx = None, None, None
     x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1)
-    x = triton_bench.swiglu.swiglu(x, 1.0, pcs)
+    x = triton_bench.swiglu.swiglu(x, 1.0, pcs, routing_data=rdata)
     x = matmul_ogs(x, w2, b2, rdata, scatter_indx=scatter_indx, precision_config=pc2)
     proton.finalize()
 
@@ -127,42 +158,70 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
     matmuls = [
         x for x in data[0]["children"] if "_matmul" in x["frame"]["name"] and "metadata" not in x["frame"]["name"]
     ]
-    tot_bytes = sum([x["metrics"]["bytes"] for x in matmuls])
-    tot_flops = {w: sum([x["metrics"].get(f"flops{w}", 0) for x in matmuls]) for w in [8, 16]}
+    bytes = sum([x["metrics"]["bytes"] for x in matmuls])
+    flops = {w: sum([x["metrics"].get(f"flops{w}", 0) for x in matmuls]) for w in [8, 16]}
+    flops = sum([flops[w] for w in [8, 16]])
     # compute total time (incl. "not useful" work)
     # TODO: proton should really be recording that in the json instead of
     # relying on the user to aggregate
-    tot_time = sum(x["metrics"].get("time (ns)", 0) for x in data[0]["children"])
-    min_time_flops = min_time_bytes = 0
-    if SPECS is not None:
-        min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
-        min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
-        min_time = max(min_time_flops, min_time_bytes)
-        util = min_time / tot_time
-    else:
-        util = 0.0
-    tflops = sum([tot_flops[w] for w in [8, 16]]) / tot_time * 1e-3
-    tbps = tot_bytes / tot_time * 1e-3
-    print(f"Utilization: {util:.0%}; {tflops:>6.1f} TFLOPs, {tbps:.1f} TB/s")
-
-    return util, tflops, tbps
+    time = sum(x["metrics"].get("time (ns)", 0) for x in data[0]["children"])
+    return PerfData(time, flops, bytes)
+
+
+def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP=1, EP=1, name="",
+                 verbose=True):
+    import numpy as np
+    from itertools import chain
+    from bisect import bisect_left
+    batches = list(chain(*[range(*r) for r in batch_ranges]))
+    # collect performance data
+    perfs = []
+    print(f"Benchmarking {name} ({x_dtype}x{w_dtype}, TP={TP}, EP={EP})...")
+    print("===============================================================")
+    for batch in batches:
+        perfs += [bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP, EP, name)]
+        if verbose:
+            print(f"Batch: {batch}; Util: {perfs[-1].util}; TFLOPS: {perfs[-1].tflops}; TBPS: {perfs[-1].tbps}")
+    print("===============================================================")
+    # machine limits
+    fig, ax = plt.subplots(figsize=(7, 5), dpi=120)
+    ax.set_xlabel("batch size (toks/expt)")
+    ax.set_ylabel("performance [TFLOP/s]")
+    ax.set_title("roofline")
+    # add a tiny margin so points are not flush with the frame
+    xs = [batch * n_expts_act / n_expts_tot for batch in batches]
+    perf = [p.tflops for p in perfs]
+    xmin, xmax = min(xs), max(xs)
+    dx = 0.05 * (xmax - xmin) if xmax > xmin else 1.0
+    ax.set_xlim(xmin - dx, xmax + dx)
+    ax.set_ylim(100, SPECS["MAX_TFLOPS8"] + 500)
+    # plot roofline
+    max_tbps = SPECS["MAX_TBPS"]
+    max_tflops = SPECS["MAX_TFLOPS8"]
+    opints = [p.opint for p in perfs]
+    knee = bisect_left(opints, max_tflops / max_tbps) - 1
+    x_bw, x_comp = xs[:knee], xs[knee:]
+    y_bw = [op * max_tbps for op in opints[:knee]]
+    y_comp = [max_tflops] * len(x_comp)
+    ax.plot(x_bw, y_bw, "--", label=f"BW-bound ({max_tbps:.0f} TB/s)")
+    ax.plot(x_comp, y_comp, "--", label=f"Compute-bound ({max_tflops:.0f} TFLOP/s)")
+    # plot data
+    ax.scatter(xs, perf, marker="+")
+    ax.legend(frameon=False, loc="lower right")
+    ax.grid(True, which="both", ls=":", lw=0.5)
+    fig.tight_layout()
+    fpath = Path(f"logs/{name}/{x_dtype}-{w_dtype}-TP{TP}-EP{EP}/roofline.png")
+    plt.savefig(fpath)
 
 
 if __name__ == "__main__":
     has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or get_cdna_version() == 4
     if SPECS is None:
         print("Current GPU has no specs provided, utilization is N/A")
-    if has_native_mx4:
-        bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense")
-        bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "mx4", TP=1, EP=1, name="dense")
-        bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4")
-        bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "mx4", TP=4, EP=1, name="llama4")
-    else:
-        # bf16/fp16 x fp8 is skipped because matmul_ogs requires x and w has the
-        # same type when not doing mxfp operation
-        bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense")
-        bench_mlp(8192, 8192, 8192, 1, 1, "fp16", "mx4", TP=1, EP=1, name="dense")
-        bench_mlp(8192, 8192, 8192, 1, 1, "bf16", "mx4", TP=1, EP=1, name="dense")
-        bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4")
-        bench_mlp(2048, 5120, 8192, 128, 4, "bf16", "mx4", TP=4, EP=1, name="llama4")
-        bench_mlp(2048, 5120, 8192, 128, 4, "fp16", "mx4", TP=4, EP=1, name="llama4")
+    batch_ranges = [(1024, 32768, 1024)]
+    dense_dtypes = ["fp8", "fp8"]
+    quantized_dtypes = ["fp8", "mx4"] if has_native_mx4 else ["bf16", "mx4"]
+    roofline_mlp(batch_ranges, 8192, 8192, 1, 1, *dense_dtypes, TP=1, EP=1, name="dense")
+    roofline_mlp(batch_ranges, 8192, 8192, 1, 1, *quantized_dtypes, TP=1, EP=1, name="dense")
+    roofline_mlp(batch_ranges, 5120, 8192, 128, 4, *dense_dtypes, TP=1, EP=1, name="llama4-maverick")
+    roofline_mlp(batch_ranges, 5120, 8192, 128, 4, *quantized_dtypes, TP=1, EP=1, name="llama4-maverick")
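
Note on the new PerfData helper: time is recorded in nanoseconds, so flops / time is FLOP per ns and the 1e-3 factor converts it to TFLOP/s; the same scaling turns bytes / time into TB/s. A minimal sketch of the unit handling, with entirely hypothetical numbers (not taken from any benchmark run):

# Hypothetical values: 2 ms of kernel time, 2e12 FLOP, 6e9 bytes of traffic.
pd = PerfData(time=2.0e6, flops=2.0e12, bytes=6.0e9)  # time in ns
print(pd.tflops)  # -> 1000.0 TFLOP/s
print(pd.tbps)    # -> 3.0 TB/s
print(pd.opint)   # -> ~333 FLOP/byte (operational intensity)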

bench/triton_bench/matmul_ogs_details/_common.py

Lines changed: 5 additions & 4 deletions
@@ -87,9 +87,10 @@ def matmul_launch_metadata(grid, kernel, args):
     fM = M if M is not None else n_tokens
     fK = K if K is not None else n_tokens
     ret[f"flops{nbits}"] = 2.0 * fM * N * fK
-    skipped = 0
+    gindx = args.get("GatherIndx", None)
     sindx = args.get("WriteBackIndx", None)
-    if sindx is not None:
-        skipped = (sindx == -1).sum() / sindx.numel()
-    ret["bytes"] = int((1 - skipped) * Y.numel() * Y.element_size() + X.numel() * X.element_size() + n_w_bytes)
+    sskipped = 0. if sindx is None else (sindx == -1).sum() / sindx.shape[0]
+    gskipped = 0. if gindx is None else (gindx == -1).sum() / gindx.shape[0]
+    ret["bytes"] = int((1 - sskipped) * Y.numel() * Y.element_size() + (1 - gskipped) * X.numel() * X.element_size() +
+                       n_w_bytes)
     return ret
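
The change above discounts both output rows whose WriteBackIndx is -1 and input rows whose GatherIndx is -1 when estimating memory traffic, instead of only the scatter side. A small sketch of the skip-fraction arithmetic, using invented index tensors purely for illustration:

import torch

sindx = torch.tensor([0, 3, -1, 2, -1])         # hypothetical WriteBackIndx
gindx = torch.tensor([1, -1, 0, 2, 4])          # hypothetical GatherIndx
sskipped = (sindx == -1).sum() / sindx.shape[0]  # 0.4: 40% of Y rows are never written
gskipped = (gindx == -1).sum() / gindx.shape[0]  # 0.2: 20% of X rows are never read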

bench/triton_bench/matmul_ogs_details/_ptma_matmul_ogs.py

Lines changed: 0 additions & 4 deletions
@@ -58,17 +58,13 @@ def _make_tensor_desc(ptr, shape, strides, block_shape, transpose: tl.constexpr
     tl.static_assert(len(shape) == len(strides))
     tl.static_assert(len(strides) == len(block_shape))
     if transpose:
-        # Pass constexpr(1) to workaround torchflow tracer changing values of 1 to 2 during compile.
-        # We check that the stride is actually 1 before launching the kernel.
         return tl.make_tensor_descriptor(
             ptr,
             shape=shape[:-2] + [shape[-1], shape[-2]],
             strides=strides[:-2] + [strides[-1], tl.constexpr(1)],
             block_shape=block_shape[:-2] + [block_shape[-1], block_shape[-2]],
         )
     else:
-        # Pass constexpr(1) to workaround torchflow tracer changing values of 1 to 2 during compile.
-        # We check that the stride is actually 1 before launching the kernel.
         return tl.make_tensor_descriptor(
             ptr,
             shape=shape,

bench/triton_bench/matmul_ogs_details/opt_flags.py

Lines changed: 1 addition & 2 deletions
@@ -59,7 +59,6 @@ def make_default_opt_flags_amd(
         block_m = 128
     elif tokens_per_expt >= 512 and n >= 2048:
         block_m = 128
-
     else:
         block_m = max(32, min(triton.next_power_of_2(tokens_per_expt), 64))
     if routing_data is not None:
@@ -139,7 +138,7 @@ make_default_opt_flags_nvidia(
     elif enforce_bitwise_invariance:
         block_m = 128
     else:
-        block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
+        block_m = max(64, min(triton.next_power_of_2(tokens_per_expt), 128))
     # TODO: remove when triton is more optimized for H100 MXFP4
     arch = None
     if (
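
The NVIDIA default now clamps block_m to the range [64, 128] rather than [16, 128]. A small standalone illustration of the clamp, with arbitrarily chosen token counts:

import triton

for tokens_per_expt in (8, 48, 100, 700):
    block_m = max(64, min(triton.next_power_of_2(tokens_per_expt), 128))
    print(tokens_per_expt, block_m)  # 8 -> 64, 48 -> 64, 100 -> 128, 700 -> 128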

bench/triton_bench/swiglu.py

Lines changed: 8 additions & 4 deletions
@@ -5,6 +5,7 @@
 from triton.tools.tensor_descriptor import TensorDescriptor
 from .swiglu_details._swiglu import _swiglu
 from triton_bench import target_info
+from .matmul_ogs_details.metadata import compute_metadata
 
 
 @dataclass(frozen=True)
@@ -23,7 +24,7 @@ class PrecisionConfig:
 class SwiGLU(torch.autograd.Function):
 
     @staticmethod
-    def forward(ctx, a, alpha, precision_config, expt_data, num_experts):
+    def forward(ctx, a, alpha, precision_config, routing_data, num_experts):
         N = a.shape[-1]
         M = a.numel() // N
         assert a.stride()[-1] == 1
@@ -48,7 +49,7 @@ def forward(ctx, a, alpha, precision_config, expt_data, num_experts):
         # launch semi-persistent kernel
         N_BLOCKS = triton.cdiv(N // 2, BLOCK_N)
         num_sms = target_info.num_sms()
-        if expt_data is not None:
+        if routing_data is not None:
             waves_per_sm = 32 if target_info.is_hip() else 128
             num_pid = num_sms * (waves_per_sm // num_warps)
             M_BLOCKS = max(1, triton.cdiv(num_pid, N_BLOCKS))
@@ -59,6 +60,9 @@ def forward(ctx, a, alpha, precision_config, expt_data, num_experts):
             grid = (8 * num_sms, )
         else:
             grid = (min(M_BLOCKS * N_BLOCKS, 4 * num_sms), )
+        expt_data = None
+        if routing_data is not None:
+            expt_data = compute_metadata(routing_data, M, BLOCK_M).buffer
         _swiglu[grid](
             out_desc,
             flex_ctx.out_data.reinterpret(out),
@@ -91,8 +95,8 @@ def forward(ctx, a, alpha, precision_config, expt_data, num_experts):
         return out
 
 
-def swiglu(a, alpha, precision_config, expt_data=None, num_experts=0):
-    return SwiGLU.apply(a, alpha, precision_config, expt_data, num_experts)
+def swiglu(a, alpha, precision_config, routing_data=None, num_experts=0):
+    return SwiGLU.apply(a, alpha, precision_config, routing_data, num_experts)
 
 
 def swiglu_torch(a, alpha, precision_config):
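
With this change, callers pass routing_data instead of a precomputed expt_data buffer; the expert metadata is now derived inside forward via compute_metadata. A hedged usage sketch mirroring the bench_mlp.py call site above, where x, pcs, and rdata are assumed to already exist (rdata may be None on the dense path):

import triton_bench

# routing_data is optional; passing None falls back to the non-routed grid.
y = triton_bench.swiglu.swiglu(x, 1.0, pcs, routing_data=rdata)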

docs/conf.py

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ def get_cmake_dir():
     plat_name = sysconfig.get_platform()
     python_version = sysconfig.get_python_version()
     dir_name = f"cmake.{plat_name}-{sys.implementation.name}-{python_version}"
-    cmake_dir = Path("../python") / "build" / dir_name
+    cmake_dir = Path("../build") / dir_name
     return cmake_dir
 
 
@@ -100,7 +100,7 @@ def setup(app):
     app.connect("autodoc-process-signature", process_sig)
     max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count()))
    print(f"Installing Triton Python package using {max_jobs} threads")
-    subprocess.run("pip install -e ..", shell=True, env=os.environ.copy())
+    subprocess.run("pip install -e ../", shell=True, env=os.environ.copy())
 
     setup_generated_mlir_docs()
 
include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 4 additions & 0 deletions
@@ -173,6 +173,10 @@ template <typename T> auto seq(T start, T end, T step) {
                   [=](T i) { return start + i * step; });
 }
 
+// Combine the current mask with the given predicate.
+Value getPredMask(RewriterBase &rewriter, Type typeLike, Value currentMask,
+                  Value pred);
+
 } // namespace triton
 } // namespace mlir
 
include/triton/Dialect/Triton/Transforms/Utility.h

Lines changed: 0 additions & 3 deletions
@@ -7,9 +7,6 @@ using namespace mlir;
 
 namespace mlir::triton {
 
-Value getPredMask(RewriterBase &rewriter, Type typeLike, Value currentMask,
-                  Value pred);
-
 triton::MakeTensorPtrOp getMakeTensorPtrOp(Value v);
 
 } // namespace mlir::triton
} // namespace mlir::triton

lib/Dialect/Triton/IR/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ add_triton_library(TritonIR
   Traits.cpp
   Types.cpp
   OpInterfaces.cpp
+  Utility.cpp
 
   DEPENDS
   TritonTableGen
