[TLX] Refactor grouped gemm with configurable subslicing (#651)

htyu · meta-codesync[bot] · commit 341b78b02cb8 · 2025-11-10T11:54:17.000-08:00
Summary: Subslicing enables bigger tile sizes and more pipeline stages, which benefits certain shapes. Triton autotuning for function grouped_matmul_tlx_kernel, best config selected: BLOCK_SIZE_M: 128, BLOCK_SIZE_N: 256, BLOCK_SIZE_K: 64, NUM_SMEM_BUFFERS: 3, NUM_TMEM_BUFFERS: 2, EPILOGUE_SUBTILE: **4**, num_warps: 4, num_ctas: 1, num_stages: 1, maxnreg: None. Pull Request resolved: #651. Reviewed By: manman-ren. Differential Revision: D86577923. Pulled By: htyu. fbshipit-source-id: cda92b66d1a727dbc1279792a0c931deead84db9
diff --git a/third_party/tlx/tutorials/blackwell-grouped-gemm.py b/third_party/tlx/tutorials/blackwell-grouped-gemm.py
@@ -344,7 +344,7 @@ def _get_bufidx_phase(accum_cnt, NUM_BUFFERS_KV):
         },
         num_warps=4,
         num_stages=1,
-    ) for BM in [128] for BN in [128, 256] for BK in [64, 128] for s in [2, 3, 4] for t in [2] for subtile in [False]
+    ) for BM in [128] for BN in [128, 256] for BK in [64, 128] for s in [2, 3, 4] for t in [2] for subtile in [1, 2, 4]
 ]
 
 
@@ -411,7 +411,7 @@ def grouped_matmul_tlx_kernel(
                         c_ptr,
                         shape=[gm, gn],
                         strides=[ldc, 1],
-                        block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+                        block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N // EPILOGUE_SUBTILE],
                     )
 
                     # iterate through the tiles in the current gemm problem
@@ -430,21 +430,16 @@ def grouped_matmul_tlx_kernel(
                         offs_cm = tile_m_idx * BLOCK_SIZE_M
                         offs_cn = tile_n_idx * BLOCK_SIZE_N
 
-                        if EPILOGUE_SUBTILE:
-                            # We load/store the result half by half to reduce SMEM pressure
-                            acc_tmem_subslice1 = tlx.subslice(acc_tmem, 0, BLOCK_SIZE_N // 2)
-                            result = tlx.local_load(acc_tmem_subslice1)
+                        slice_size: tl.constexpr = BLOCK_SIZE_N // EPILOGUE_SUBTILE
+                        for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                            acc_slice = tlx.local_slice(
+                                acc_tmem,
+                                [0, slice_id * slice_size],
+                                [BLOCK_SIZE_M, slice_size],
+                            )
+                            result = tlx.local_load(acc_slice)
                             c = result.to(tl.float16)
-                            c_desc.store([offs_cm, offs_cn], c)
-
-                            acc_tmem_subslice2 = tlx.subslice(acc_tmem, BLOCK_SIZE_N // 2, BLOCK_SIZE_N // 2)
-                            result = tlx.local_load(acc_tmem_subslice2)
-                            c = result.to(tl.float16)
-                            c_desc.store([offs_cm, offs_cn + BLOCK_SIZE_N // 2], c)
-                        else:
-                            result = tlx.local_load(acc_tmem)
-                            c = result.to(tl.float16)
-                            c_desc.store([offs_cm, offs_cn], c)
+                            c_desc.store([offs_cm, offs_cn + slice_id * slice_size], c)
 
                         # done storing this buffer, signal MMA consumer to resume writing to it
                         tlx.barrier_arrive(tmem_empty_bars[tmem_buf], 1)