
Commit 24863d6 (parent: d141ab8)

[KERNELS] Enable 4-way epilogue subtiling for _p_matmul_ogs. (#7056)

* This uses PR #7044.
* Also streamlined "expensive epilogue" handling so that we can specify how expensive it is.

6 files changed: 63 additions, 36 deletions

python/triton_kernels/tests/test_matmul.py

Lines changed: 7 additions & 6 deletions
@@ -148,7 +148,7 @@ class Case:
     n_expt_shards: int = 1
     split_k: int = 1
     hbm_swizzling: bool = False
-    epilogue_subtile: Union[bool, None] = None
+    epilogue_subtile: Union[int, None] = None


 @pytest.mark.parametrize(
@@ -171,8 +171,9 @@ class Case:
         Case(300, 400, 400, "ragged", "float16", "float16"),
         Case(300, 400, 400, "ragged", "float8_e5m2", "float8_e5m2"),
         Case(1000, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 3, 1),
-        Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, epilogue_subtile=False),
-        Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, epilogue_subtile=True),
+        Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, epilogue_subtile=1),
+        Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, epilogue_subtile=2),
+        Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, epilogue_subtile=4),
         Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2),
         Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 2, n_expt_shards=2),
         Case(600, 400, 400, "ragged", "float8_e5m2", "float8_e5m2", 4, 1, n_expt_shards=2),
@@ -424,9 +425,9 @@ def round_x(x, idx):
     (True, True, True),
 ])
 @pytest.mark.parametrize("is_persistent, epilogue_subtile", [
-    (False, False),
-    (True, False),
-    (True, True),
+    (False, None),
+    (True, 1),
+    (True, 4),
 ])
 @pytest.mark.parametrize("swiglu_alpha, swiglu_limit", [
     (1.1, 1.4),

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ class Epilogue:
     specs: FnSpecs
     fn_arg_values_matmul: tuple[object]
     fn_arg_values_finalize: tuple[object]
-    is_expensive: bool = False
+    effective_itemsize: float | None = None


 EpilogueSpecs = FnSpecs  # TODO: remove this alias when callers are updated
@@ -564,7 +564,7 @@ def matmul_ogs(x, w, bias,
         M, N, K, routing_data,
         can_use_persistent_tma(x, w, gather_indx, precision_config),
         can_use_fused_scatter(scatter_indx, fused_activation),
-        epilogue.is_expensive,
+        epilogue.effective_itemsize,
     )
     # compute grid size
     if not is_input_batched:
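
The dataclass change replaces a coarse boolean with a quantitative hint. A minimal sketch of how the new field is meant to be read downstream (the helper name acc_itemsize is illustrative; the real consumer is compute_num_stages in opt_flags_nvidia.py, shown later in this diff):

    from dataclasses import dataclass

    @dataclass
    class Epilogue:  # mirror of the dataclass above, trimmed to the new field
        effective_itemsize: float | None = None  # bytes per accumulator element

    def acc_itemsize(epilogue_effective_itemsize, out_itemsize):
        # Matches `epilogue_effective_itemsize or out_dtype.itemsize` in
        # compute_num_stages: None falls back to the output element size.
        return epilogue_effective_itemsize or out_itemsize

    assert acc_itemsize(None, 2) == 2    # plain epilogue: accumulator ~ out dtype
    assert acc_itemsize(4.0, 2) == 4.0   # expensive epilogue: budget 4 bytes/elt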

python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py

Lines changed: 4 additions & 1 deletion
@@ -85,7 +85,10 @@ def matmul_launch_metadata(grid, kernel, args):
     batch_repr = ""
     if "batch_size" in args and args["batch_size"] > 1:
         batch_repr = repr("B", args["batch_size"]) + ", "
-    ret["name"] = f"{kernel.name} [{batch_repr}{repr('M', M)}, {repr('N', N)}, {repr('K', K)}]"
+    ret["name"] = f"{kernel.name} [{batch_repr}{repr('M', M)}, {repr('N', N)}, {repr('K', K)}] stg{kernel.num_stages}"
+    ep_subtile = args["EPILOGUE_SUBTILE"]
+    if ep_subtile is not None and ep_subtile > 1:
+        ret["name"] += f" ep/{ep_subtile}"
     fM = M if M is not None else n_tokens
     fK = K if K is not None else n_tokens
     ret[f"flops{nbits}"] = 2.0 * fM * N * fK

python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py

Lines changed: 25 additions & 7 deletions
@@ -199,7 +199,11 @@ def _p_matmul_ogs(
     HAS_FUSED_SCATTER: tl.constexpr = WriteBackIndx is not None
     index_type: tl.constexpr = tl.int64

-    EPILOGUE_BLOCK_N: tl.constexpr = BLOCK_N // 2 if EPILOGUE_SUBTILE else BLOCK_N
+    if EPILOGUE_SUBTILE is None:
+        SUBTILE_FACTOR: tl.constexpr = 1
+    else:
+        SUBTILE_FACTOR: tl.constexpr = EPILOGUE_SUBTILE
+    EPILOGUE_BLOCK_N: tl.constexpr = BLOCK_N // SUBTILE_FACTOR
     OUT_BLOCK_N: tl.constexpr = EPILOGUE_BLOCK_N // ACTIVATION_REDUCTION_N
     yN = N // ACTIVATION_REDUCTION_N

@@ -500,12 +504,26 @@ def _p_matmul_ogs(
     else:
         w_scale = load_scale(WScale)

-    if EPILOGUE_SUBTILE:
-        accs = tl.split(tl.permute(tl.reshape(acc, (BLOCK_M, 2, EPILOGUE_BLOCK_N)), (0, 2, 1)))
-        biases = tl.split(tl.permute(tl.reshape(bias, (2, EPILOGUE_BLOCK_N)), (1, 0)))
-    else:
-        accs = (acc,)
-        biases = (bias,)
+    accs = (acc,)
+    biases = (bias,)
+
+    if SUBTILE_FACTOR >= 2:
+        acc0, acc1 = acc.reshape(BLOCK_M, 2, BLOCK_N // 2).permute(0, 2, 1).split()
+        accs = (acc0, acc1)
+        bias0, bias1 = bias.reshape(2, BLOCK_N // 2).permute(1, 0).split()
+        biases = (bias0, bias1)
+
+    if SUBTILE_FACTOR >= 4:
+        acc00, acc01 = acc0.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1).split()
+        acc10, acc11 = acc1.reshape(BLOCK_M, 2, BLOCK_N // 4).permute(0, 2, 1).split()
+        accs = (acc00, acc01, acc10, acc11)
+        bias00, bias01 = bias0.reshape(2, BLOCK_N // 4).permute(1, 0).split()
+        bias10, bias11 = bias1.reshape(2, BLOCK_N // 4).permute(1, 0).split()
+        biases = (bias00, bias01, bias10, bias11)
+
+    tl.static_assert(EPILOGUE_BLOCK_N == BLOCK_N // SUBTILE_FACTOR)
+    tl.static_assert(len(accs) == SUBTILE_FACTOR)
+    tl.static_assert(len(biases) == SUBTILE_FACTOR)

     for a_i in tl.static_range(len(accs)):
         acc_tile = accs[a_i]
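
The reshape/permute/split chain is easiest to sanity-check outside the kernel. A minimal NumPy sketch (emulating tl.split, which detaches a trailing axis of size 2, via slicing; the tile sizes are illustrative) showing that the 4-way split yields four contiguous BLOCK_N // 4 column slices of the accumulator:

    import numpy as np

    BLOCK_M, BLOCK_N = 4, 8  # illustrative sizes
    acc = np.arange(BLOCK_M * BLOCK_N).reshape(BLOCK_M, BLOCK_N)

    def split(t):
        # emulate tl.split: detach the trailing axis, which must have size 2
        return t[..., 0], t[..., 1]

    # 2-way: each half covers a contiguous BLOCK_N // 2 column range
    acc0, acc1 = split(acc.reshape(BLOCK_M, 2, BLOCK_N // 2).transpose(0, 2, 1))
    assert (acc0 == acc[:, :BLOCK_N // 2]).all()
    assert (acc1 == acc[:, BLOCK_N // 2:]).all()

    # 4-way: recurse once per half; the quarters stay contiguous in N
    acc00, acc01 = split(acc0.reshape(BLOCK_M, 2, BLOCK_N // 4).transpose(0, 2, 1))
    acc10, acc11 = split(acc1.reshape(BLOCK_M, 2, BLOCK_N // 4).transpose(0, 2, 1))
    for i, sub in enumerate((acc00, acc01, acc10, acc11)):
        assert (sub == acc[:, i * BLOCK_N // 4:(i + 1) * BLOCK_N // 4]).all()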

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 20 additions & 14 deletions
@@ -20,7 +20,7 @@ class OptFlags:
     split_k: int
     fused_scatter: bool
     is_persistent: bool
-    epilogue_subtile: bool
+    epilogue_subtile: int | None
     arch: str
     target_kernel_kwargs: dict

@@ -43,7 +43,7 @@ def make_default_opt_flags_amd(
     can_use_persistent_tma,
     can_use_fused_scatter,
     enforce_bitwise_invariance,
-    has_expensive_epilogue,
+    epilogue_effective_itemsize,
     constraints,
 ):
     constraints_supported = ["block_m", "block_k", "split_k", "fused_scatter", "is_persistent", "epilogue_subtile"]
@@ -106,7 +106,7 @@ def make_default_opt_flags_amd(
     if constraints.get("epilogue_subtile", None) is not None:
         epilogue_subtile = constraints["epilogue_subtile"]
     else:
-        epilogue_subtile = False
+        epilogue_subtile = None
     # AMD-specific
     target_kernel_kwargs = {"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1}
     ret = OptFlags(
@@ -142,10 +142,10 @@ def make_default_opt_flags_nvidia(
     can_use_persistent_tma,
     can_use_fused_scatter,
     enforce_bitwise_invariance,
-    has_expensive_epilogue,
+    epilogue_effective_itemsize,
     constraints,
 ):
-    constraints_supported = ["block_m", "block_k", "split_k", "fused_scatter", "is_persistent", "epilogue_subtile"]
+    constraints_supported = ["block_m", "block_k", "split_k", "fused_scatter", "is_persistent", "epilogue_subtile", "num_stages"]
     assert not any([c not in constraints_supported for c in constraints]), constraints.keys()
     # tokens per expert
     if routing_data is None:
@@ -175,7 +175,7 @@ def make_default_opt_flags_nvidia(
     if constraints.get("is_persistent", None) is not None:
         is_persistent = constraints["is_persistent"]
     else:
-        has_simple_epilogue = precision_config.max_num_imprecise_acc is None and not has_expensive_epilogue
+        has_simple_epilogue = precision_config.max_num_imprecise_acc is None
         is_persistent = supports_persistent and has_simple_epilogue and (tiles_per_sm >= 2.0 or lhs_dtype.itemsize <= 1) and out_dtype.itemsize < 4
     # block k
     if constraints.get("block_k", None) is not None:
@@ -204,14 +204,20 @@ def make_default_opt_flags_nvidia(
         lhs_dtype,
         rhs_dtype,
     )
+
     if constraints.get("epilogue_subtile", None) is not None:
-        epilogue_subtile = constraints["epilogue_subtile"]
+        subtiles_to_check = [constraints["epilogue_subtile"]]
     else:
-        n1 = opt_flags_nvidia.compute_num_stages(*compute_num_stages_args, False, has_expensive_epilogue)
-        n2 = opt_flags_nvidia.compute_num_stages(*compute_num_stages_args, True, has_expensive_epilogue)
-        epilogue_subtile = n2 > n1  # enable epilogue_subtile if it increases the number of stages
-    # num_stages
-    num_stages = opt_flags_nvidia.compute_num_stages(*compute_num_stages_args, epilogue_subtile, has_expensive_epilogue)
+        subtiles_to_check = [1, 2, 4]
+    num_stages = -1
+    for ep in subtiles_to_check:
+        ns = opt_flags_nvidia.compute_num_stages(*compute_num_stages_args, ep, epilogue_effective_itemsize)
+        if ns > num_stages:
+            epilogue_subtile, num_stages = ep, ns
+    assert num_stages >= 1
+    if constraints.get("num_stages", None):
+        num_stages = constraints["num_stages"]
+
     # fused scatter scratchpad
     if constraints.get("fused_scatter", None) is not None:
         fused_scatter = constraints["fused_scatter"]
@@ -273,7 +279,7 @@ def make_opt_flags(
     routing_data,
     can_use_persistent_tma,
     can_use_fused_scatter,
-    has_expensive_epilogue,
+    epilogue_effective_itemsize,
 ):
     microscaling_ctx = precision_config.mx_ctx
     enforce_bitwise_invariance = precision_config.enforce_bitwise_invariance
@@ -282,7 +288,7 @@ def make_opt_flags(
         return _opt_flags
     args = [out_dtype, lhs_dtype, rhs_dtype, precision_config, microscaling_ctx, m, n, k,
             routing_data, can_use_persistent_tma, can_use_fused_scatter,
-            enforce_bitwise_invariance, has_expensive_epilogue, _opt_flags_constraints]
+            enforce_bitwise_invariance, epilogue_effective_itemsize, _opt_flags_constraints]
     backend = triton.runtime.driver.active.get_current_target().backend
     if backend == "hip":
         return make_default_opt_flags_amd(*args)
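
In short, the flag is now chosen by brute force: try each candidate subtile factor and keep the one that lets the software pipeliner fit the most stages. A minimal sketch of that search (stage_count below is a stand-in for opt_flags_nvidia.compute_num_stages):

    def pick_epilogue_subtile(stage_count, constrained=None):
        # `stage_count(factor)` returns how many pipeline stages fit in SMEM
        # for a given epilogue subtile factor; a constraint pins the candidates.
        candidates = [constrained] if constrained is not None else [1, 2, 4]
        best_factor, best_stages = None, -1
        for factor in candidates:
            stages = stage_count(factor)
            if stages > best_stages:
                best_factor, best_stages = factor, stages
        assert best_stages >= 1
        return best_factor, best_stages

    # e.g. with a toy model where each doubling of the factor frees one stage:
    print(pick_epilogue_subtile(lambda f: {1: 3, 2: 4, 4: 5}[f]))  # -> (4, 5)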

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_nvidia.py

Lines changed: 5 additions & 6 deletions
@@ -69,7 +69,7 @@ def compute_num_stages(
     lhs_dtype,
     rhs_dtype,
     epilogue_subtile,
-    has_expensive_epilogue,
+    epilogue_effective_itemsize,
 ):
     if precision_config.max_num_imprecise_acc is not None:
         return 3
@@ -88,19 +88,18 @@ def compute_num_stages(
     if is_persistent:
         # Per-stage wait barrier
         stage_size += 8
-        acc_size = out_dtype.itemsize
         if target_info.cuda_capability_geq(10, 0):
-            acc_size = 4 if has_expensive_epilogue else out_dtype.itemsize
+            acc_size = epilogue_effective_itemsize or out_dtype.itemsize
         else:
             acc_size = out_dtype.itemsize
-        if target_info.cuda_capability_geq(10, 0) and epilogue_subtile and not has_expensive_epilogue:
-            acc_block_n = block_n // 2
+        if target_info.cuda_capability_geq(10, 0) and epilogue_subtile is not None:
+            acc_block_n = block_n // epilogue_subtile
         else:
             acc_block_n = block_n
         # pipelined TMA store local to global, or
         # pipelined layout conversion before store of the accumulator
         # note: layout conversion has some padding
-        smem_capacity -= (block_m + 4) * acc_block_n * acc_size
+        smem_capacity -= int((block_m + 4) * acc_block_n * acc_size)
     if microscaling_ctx.weight_scale is not None:
         # mx scales
         stage_size += block_n * (block_k // 32)
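
The effect of the subtile factor on stage count is pure arithmetic: a larger factor shrinks the accumulator staging buffer, leaving more SMEM for pipeline stages. An illustrative-only calculation (the capacity, tile sizes, and per-stage cost model below are assumed round numbers, not the kernel's real accounting):

    def max_stages(smem_capacity, block_m, block_n, block_k,
                   acc_size, lhs_itemsize, rhs_itemsize, subtile_factor):
        # accumulator staging buffer, as above: (block_m + 4) rows of the
        # (possibly subtiled) N extent, acc_size bytes per element
        smem_capacity -= int((block_m + 4) * (block_n // subtile_factor) * acc_size)
        # simplified per-stage cost: one A tile plus one B tile
        stage_size = block_m * block_k * lhs_itemsize + block_k * block_n * rhs_itemsize
        return smem_capacity // stage_size

    # 228 KiB SMEM, 128x128 tile, block_k=64, fp32 accumulator, fp8 operands:
    print(max_stages(228 * 1024, 128, 128, 64, 4, 1, 1, subtile_factor=1))  # 10
    print(max_stages(228 * 1024, 128, 128, 64, 4, 1, 1, subtile_factor=4))  # 13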
