 """
 
 import argparse
+import itertools
 
 import torch
 import triton
@@ -46,10 +47,15 @@ def supports_tma():
     return is_cuda() and torch.cuda.get_device_capability()[0] >= 9
 
 
+def supports_ws():
+    return is_cuda() and torch.cuda.get_device_capability()[0] >= 10
+
+
 def _matmul_launch_metadata(grid, kernel, args):
     ret = {}
-    M, N, K = args["M"], args["N"], args["K"]
-    ret["name"] = f"{kernel.name} [M={M}, N={N}, K={K}]"
+    M, N, K, WS = args["M"], args["N"], args["K"], args.get("WARP_SPECIALIZE", False)
+    ws_str = "_ws" if WS else ""
+    ret["name"] = f"{kernel.name}{ws_str} [M={M}, N={N}, K={K}]"
     if "c_ptr" in args:
         bytes_per_elem = args["c_ptr"].element_size()
     else:
@@ -61,6 +67,7 @@ def _matmul_launch_metadata(grid, kernel, args):
 
 HAS_TMA_DESC = supports_tma() and hasattr(tl, "nv_tma_desc_type")
 HAS_TENSOR_DESC = supports_tma() and hasattr(tl, "make_tensor_descriptor")
+HAS_WARP_SPECIALIZE = supports_ws() and HAS_TENSOR_DESC
 
 
 # TmaAutoTuneHelper used in htyu's PR #5622
@@ -197,17 +204,18 @@ def matmul(a, b):
 
 @triton.autotune(
     configs=matmul_get_configs(),
-    key=["M", "N", "K"],
+    key=["M", "N", "K", "WARP_SPECIALIZE"],
 )
 @triton.jit(launch_metadata=_matmul_launch_metadata)
-def matmul_kernel_tma_ws(a_desc_ptr, b_desc_ptr, c_desc_ptr,  #
-                         M, N, K,  #
-                         BLOCK_SIZE_M: tl.constexpr,  #
-                         BLOCK_SIZE_N: tl.constexpr,  #
-                         BLOCK_SIZE_K: tl.constexpr,  #
-                         GROUP_SIZE_M: tl.constexpr,  #
-                         FP8_OUTPUT: tl.constexpr,  #
-                         ):
+def matmul_kernel_tma(a_desc_ptr, b_desc_ptr, c_desc_ptr,  #
+                      M, N, K,  #
+                      BLOCK_SIZE_M: tl.constexpr,  #
+                      BLOCK_SIZE_N: tl.constexpr,  #
+                      BLOCK_SIZE_K: tl.constexpr,  #
+                      GROUP_SIZE_M: tl.constexpr,  #
+                      FP8_OUTPUT: tl.constexpr,  #
+                      WARP_SPECIALIZE: tl.constexpr,  #
+                      ):
     dtype = tl.float8e4nv if FP8_OUTPUT else tl.float16
 
     pid = tl.program_id(axis=0)
@@ -227,7 +235,7 @@ def matmul_kernel_tma_ws(a_desc_ptr, b_desc_ptr, c_desc_ptr, #
 
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
 
-    for k in tl.range(k_tiles, warp_specialize=True, num_stages=3):
+    for k in tl.range(k_tiles, warp_specialize=WARP_SPECIALIZE):
         offs_k = k * BLOCK_SIZE_K
         a = tl._experimental_descriptor_load(a_desc_ptr, [offs_am, offs_k], [BLOCK_SIZE_M, BLOCK_SIZE_K], dtype)
         b = tl._experimental_descriptor_load(b_desc_ptr, [offs_bn, offs_k], [BLOCK_SIZE_N, BLOCK_SIZE_K], dtype)
@@ -240,7 +248,7 @@ def matmul_kernel_tma_ws(a_desc_ptr, b_desc_ptr, c_desc_ptr, #
     tl._experimental_descriptor_store(c_desc_ptr, c, [offs_cm, offs_cn])
 
 
-def matmul_tma_ws(a, b):
+def matmul_tma(a, b, warp_specialize: bool):
     # Check constraints.
     assert a.shape[1] == b.shape[1], "Incompatible dimensions"  # b is transposed
     assert a.dtype == b.dtype, "Incompatible dtypes"
@@ -296,10 +304,11 @@ def grid(META):
     desc_b = desc_helper.get_tma_descriptor_kernel_param("b")
     desc_c = desc_helper.get_tma_descriptor_kernel_param("c")
 
-    matmul_kernel_tma_ws[grid](
+    matmul_kernel_tma[grid](
         desc_a, desc_b, desc_c,  #
         M, N, K,  #
         FP8_OUTPUT=dtype == torch.float8_e4m3fn,  #
+        WARP_SPECIALIZE=warp_specialize,  #
     )
     return c
 
@@ -402,19 +411,23 @@ def matmul_persistent(a, b):
 
 def matmul_tma_persistent_get_configs():
     return [
-        triton.Config({'BLOCK_SIZE_M': BM, 'BLOCK_SIZE_N': BN, "BLOCK_SIZE_K": BK, "GROUP_SIZE_M": 8, "EPILOGUE_SUBTILE": SUBTILE}, num_stages=s, num_warps=w) \
-        for BM in [128] \
-        for BN in [128, 256] \
-        for BK in [64, 128] \
-        for s in ([2, 3, 4]) \
-        for w in [4, 8] \
-        for SUBTILE in [True, False] \
+        triton.Config(
+            {
+                'BLOCK_SIZE_M': BM, 'BLOCK_SIZE_N': BN, "BLOCK_SIZE_K": BK, "GROUP_SIZE_M": 8, "EPILOGUE_SUBTILE":
+                SUBTILE
+            }, num_stages=s, num_warps=w)  #
+        for BM in [128]  #
+        for BN in [128, 256]  #
+        for BK in [64, 128]  #
+        for s in ([2, 3, 4])  #
+        for w in [4, 8]  #
+        for SUBTILE in [True, False]  #
     ]
 
 
 @triton.autotune(
     configs=matmul_tma_persistent_get_configs(),
-    key=["M", "N", "K"],
+    key=["M", "N", "K", "WARP_SPECIALIZE"],
 )
 @triton.jit(launch_metadata=_matmul_launch_metadata)
 def matmul_kernel_tma_persistent(a_desc_ptr, b_desc_ptr, c_desc_ptr,  #
@@ -425,7 +438,9 @@ def matmul_kernel_tma_persistent(a_desc_ptr, b_desc_ptr, c_desc_ptr, #
                                  GROUP_SIZE_M: tl.constexpr,  #
                                  FP8_OUTPUT: tl.constexpr,  #
                                  EPILOGUE_SUBTILE: tl.constexpr,  #
-                                 NUM_SMS: tl.constexpr):  #
+                                 NUM_SMS: tl.constexpr,  #
+                                 WARP_SPECIALIZE: tl.constexpr,  #
+                                 ):
     dtype = tl.float8e4nv if FP8_OUTPUT else tl.float16
     start_pid = tl.program_id(axis=0)
     num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
@@ -439,7 +454,7 @@ def matmul_kernel_tma_persistent(a_desc_ptr, b_desc_ptr, c_desc_ptr, #
     # Enable warp specialization to leverage async warp scheduling in the GPU.
     # FIXME: This only works on Blackwell right now. On older GPUs, this will
     # use software pipelining.
-    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True, warp_specialize=True):
+    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True, warp_specialize=WARP_SPECIALIZE):
         pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS)
         offs_am = pid_m * BLOCK_SIZE_M
         offs_bn = pid_n * BLOCK_SIZE_N
@@ -473,7 +488,7 @@ def matmul_kernel_tma_persistent(a_desc_ptr, b_desc_ptr, c_desc_ptr, #
             tl._experimental_descriptor_store(c_desc_ptr, accumulator, [offs_am_c, offs_bn_c])
 
 
-def matmul_tma_persistent(a, b):
+def matmul_tma_persistent(a, b, warp_specialize: bool):
     # Check constraints.
     assert a.shape[1] == b.shape[1], "Incompatible dimensions"  # b is transposed
     assert a.dtype == b.dtype, "Incompatible dtypes"
@@ -542,13 +557,14 @@ def grid(META):
         M, N, K,  #
         FP8_OUTPUT=dtype == torch.float8_e4m3fn,  #
         NUM_SMS=NUM_SMS,  #
+        WARP_SPECIALIZE=warp_specialize,  #
     )
     return c
 
 
 @triton.autotune(
     configs=matmul_tma_persistent_get_configs(),
-    key=["M", "N", "K"],
+    key=["M", "N", "K", "WARP_SPECIALIZE"],
 )
 @triton.jit(launch_metadata=_matmul_launch_metadata)
 def matmul_kernel_descriptor_persistent(a_ptr, b_ptr, c_ptr,  #
@@ -558,7 +574,9 @@ def matmul_kernel_descriptor_persistent(a_ptr, b_ptr, c_ptr, #
                                         BLOCK_SIZE_K: tl.constexpr,  #
                                         GROUP_SIZE_M: tl.constexpr,  #
                                         EPILOGUE_SUBTILE: tl.constexpr,  #
-                                        NUM_SMS: tl.constexpr):  #
+                                        NUM_SMS: tl.constexpr,  #
+                                        WARP_SPECIALIZE: tl.constexpr,  #
+                                        ):
     # Matmul using TMA and device-side descriptor creation
     dtype = c_ptr.dtype.element_ty
     start_pid = tl.program_id(axis=0)
@@ -591,7 +609,7 @@ def matmul_kernel_descriptor_persistent(a_ptr, b_ptr, c_ptr, #
     tile_id_c = start_pid - NUM_SMS
     num_pid_in_group = GROUP_SIZE_M * num_pid_n
 
-    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True):
+    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True, warp_specialize=WARP_SPECIALIZE):
         pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS)
         offs_am = pid_m * BLOCK_SIZE_M
         offs_bn = pid_n * BLOCK_SIZE_N
@@ -621,7 +639,7 @@ def matmul_kernel_descriptor_persistent(a_ptr, b_ptr, c_ptr, #
             c_desc.store([offs_cm, offs_cn], c)
 
 
-def matmul_descriptor_persistent(a, b):
+def matmul_descriptor_persistent(a, b, warp_specialize: bool):
     # Check constraints.
     assert a.shape[1] == b.shape[1], "Incompatible dimensions"  # b is transposed
     assert a.dtype == b.dtype, "Incompatible dtypes"
@@ -644,6 +662,7 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]):
         a, b, c,  #
         M, N, K,  #
         NUM_SMS=NUM_SMS,  #
+        WARP_SPECIALIZE=warp_specialize,  #
     )
     return c
 
@@ -683,12 +702,14 @@ def proton_context():
         proton.deactivate(0)
 
 
-def bench_fn(reps, warmup_reps, fn, *args):
+def bench_fn(label, reps, warmup_reps, fn, *args):
+    print(f"Benchmarking {label}: ...", end="")
     for _ in range(warmup_reps):
         fn(*args)
     with proton_context():
         for _ in range(reps):
             fn(*args)
+    print(f"\rBenchmarking {label}: done")
 
 
 def bench(K, dtype, reps=10000, warmup_reps=10000):
@@ -700,60 +721,55 @@ def bench(K, dtype, reps=10000, warmup_reps=10000):
     b = b.T.contiguous()
 
     if cublas is not None:
-        bench_fn(reps, warmup_reps, cublas_matmul, a, b)
+        bench_fn("cublas", reps, 1, cublas_matmul, a, b)
     if dtype == torch.float16:
-        bench_fn(reps, warmup_reps, torch_matmul, a, b)
-    bench_fn(reps, warmup_reps, matmul, a, b.T)
-    bench_fn(reps, warmup_reps, matmul_persistent, a, b.T)
-    if HAS_TMA_DESC:
-        bench_fn(reps, warmup_reps, matmul_tma_persistent, a, b)
-    if HAS_TENSOR_DESC:
-        bench_fn(reps, warmup_reps, matmul_descriptor_persistent, a, b)
-        bench_fn(reps, warmup_reps, matmul_tma_ws, a, b)
+        bench_fn("torch", reps, warmup_reps, torch_matmul, a, b)
+    bench_fn("naive", reps, warmup_reps, matmul, a, b.T)
+    bench_fn("persistent", reps, warmup_reps, matmul_persistent, a, b.T)
+    warp_specialize = [False, True] if HAS_WARP_SPECIALIZE else [False]
+    for ws in warp_specialize:
+        ws_str = "_ws" if ws else ""
+        if HAS_TMA_DESC:
+            bench_fn(f"tma_persistent{ws_str}", reps, warmup_reps, lambda a, b: matmul_tma_persistent(a, b, ws), a, b)
+            bench_fn(f"tma{ws_str}", reps, warmup_reps, lambda a, b: matmul_tma(a, b, ws), a, b)
+        if HAS_TENSOR_DESC:
+            bench_fn(f"descriptor_persistent{ws_str}", reps, warmup_reps,
+                     lambda a, b: matmul_descriptor_persistent(a, b, ws), a, b)
+
+
+def run_test(expect, fn, a, b, label, enabled=True):
+    print(f" {label}: ...", end="")
+    if enabled:
+        actual = fn(a, b)
+        passed = torch.allclose(expect, actual.to(expect.dtype), atol=1.0)
+        icon = "✅" if passed else "❌"
+    else:
+        icon = "⭕"
+    print(f"\r {label}: {icon}")
 
 
 def validate(M, N, K, dtype):
+    print(f"{M=}, {N=}, {K=}, verification naive vs: ")
     a = torch.randn((M, K), device="cuda", dtype=torch.float16).to(dtype)
     b = torch.randn((K, N), device="cuda", dtype=torch.float16).to(dtype)
     b = b.T.contiguous()
 
-    torch_result = torch_matmul(a, b) if dtype == torch.float16 else None
-    cublas_result = cublas_matmul(a, b) if cublas is not None else None
-    naive_result = matmul(a, b.T)
-    tma_ws_result = matmul_tma_ws(a, b) if HAS_TENSOR_DESC else None
-    persistent_result = matmul_persistent(a, b.T)
-    tma_persistent_result = matmul_tma_persistent(a, b) if HAS_TMA_DESC else None
-    descriptor_persistent_result = matmul_descriptor_persistent(a, b) if HAS_TENSOR_DESC else None
-
-    if tma_ws_result is not None:
-        naive_vs_tma_ws = "✅" if torch.allclose(naive_result.to(torch.float16), tma_ws_result.to(torch.float16),
-                                                atol=1.0) else "❌"
-    if torch_result is not None:
-        naive_vs_torch = "✅" if torch.allclose(naive_result.to(torch.float16), torch_result.to(torch.float16),
-                                               atol=1.0) else "❌"
-    if cublas_result is not None:
-        naive_vs_cublas = "✅" if torch.allclose(naive_result.to(torch.float16), cublas_result.to(torch.float16),
-                                                atol=1.0) else "❌"
-    naive_vs_persistent = "✅" if torch.allclose(naive_result.to(torch.float16), persistent_result.to(torch.float16),
-                                                atol=1.0) else "❌"
-    if tma_persistent_result is not None:
-        naive_vs_tma_persistent = "✅" if torch.allclose(cublas_result.to(torch.float16),
-                                                         tma_persistent_result.to(torch.float16), atol=1.0) else "❌"
-    if descriptor_persistent_result is not None:
-        naive_vs_descriptor_persistent = "✅" if torch.allclose(cublas_result.to(
-            torch.float16), descriptor_persistent_result.to(torch.float16), atol=1.0) else "❌"
-    print(f"M={M}, N={N}, K={K} verification naive vs: ", end="")
-    if tma_ws_result is not None:
-        print(f"tma: {naive_vs_tma_ws} ", end="")
-    if torch_result is not None:
-        print(f"torch: {naive_vs_torch} ", end="")
-    if cublas_result is not None:
-        print(f"cublas: {naive_vs_cublas} ", end="")
-    print(f"persistent: {naive_vs_persistent} ", end="")
-    if tma_persistent_result is not None:
-        print(f"TMA persistent: {naive_vs_tma_persistent} ", end="")
-    if descriptor_persistent_result is not None:
-        print(f"Tensor descriptor persistent: {naive_vs_descriptor_persistent} ", end="")
+    naive_result = matmul(a, b.T).to(torch.float16)
+    run_test(naive_result, torch_matmul, a, b, "Torch", enabled=dtype == torch.float16)
+    run_test(naive_result, cublas_matmul, a, b, "cuBLAS", enabled=cublas is not None)
+    run_test(naive_result, matmul_persistent, a, b.T, "Persistent")
+
+    kernels = [
+        (matmul_tma, "TMA", HAS_TMA_DESC),
+        (matmul_tma_persistent, "TMA Persistent", HAS_TMA_DESC),
+        (matmul_descriptor_persistent, "Tensor Descriptor Persistent", HAS_TENSOR_DESC),
+    ]
+    warp_specialize = [False, True] if HAS_WARP_SPECIALIZE else [False]
+
+    for (kernel, label, enabled), warp_specialize in itertools.product(kernels, warp_specialize):
+        label = f"{label} (warp_specialize={warp_specialize})"
+        enabled = enabled and (not warp_specialize or HAS_TENSOR_DESC)
+        run_test(naive_result, lambda a, b: kernel(a, b, warp_specialize), a, b, label, enabled)
     print()
 
 