Skip to content

Commit 3251bb8

Browse files
authored
[AMD] Support Skinny Blocks for TDM on gfx1250 (#8479)
This PR changes the warp distribution so TDM can load/store skinny blocks such as 1x512.
1 parent 90666a8 commit 3251bb8

File tree

2 files changed

+45
-15
lines changed

2 files changed

+45
-15
lines changed

third_party/amd/lib/TritonAMDGPUToLLVM/TDMUtility.cpp

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,19 @@ decodeTDMDescriptor(RewriterBase &rewriter, Location loc,
3535

3636
return {srcPtr, tensorShape, tensorStride};
3737
}
38+
39+
// Splits `numWarps` into a 2D warp distribution {dim0, dim1} over a 2D block
// so that skinny blocks (e.g. 1x512) can still use every warp: the dim-0 warp
// count is halved until it fits within blockShape[0], and the remaining warps
// are assigned along dim 1.
SmallVector<int> getWarpDistribution(ArrayRef<int64_t> blockShape,
                                     int numWarps) {
  int numWarpsDim0 = numWarps;
  // Halve the dim-0 warp count until it no longer exceeds the block's first
  // dimension; leftover warps fold into dim 1 below.
  for (; numWarpsDim0 > blockShape[0]; numWarpsDim0 /= 2)
    ;

  // Guard BEFORE dividing: if blockShape[0] == 0 the loop above drives
  // numWarpsDim0 to 0, and the division below would be undefined behavior.
  assert(numWarpsDim0 > 0 && "Can't distribute warps in TDM");

  int numWarpsDim1 = numWarps / numWarpsDim0;
  // Each dim-1 warp must own a whole slice of the second block dimension.
  assert(blockShape[1] % numWarpsDim1 == 0 && "Can't distribute warps in TDM");

  return {numWarpsDim0, numWarpsDim1};
}
3851
} // namespace
3952

4053
std::pair<SmallVector<Value>, SmallVector<Value>>
@@ -56,8 +69,10 @@ createTDMDescriptor(RewriterBase &rewriter, Location loc,
5669
tensorStride[0] = b.trunc(i32_ty, tensorStride[0]);
5770
tensorStride[1] = b.trunc(i32_ty, tensorStride[1]);
5871

59-
// For block shape [M, N], each warp will handle shape [M/numWarps, N].
60-
blockShape[0] = ceil(blockShape[0], int64_t(numWarps));
72+
// Distribute block among warps
73+
auto warps = getWarpDistribution(blockShape, numWarps);
74+
blockShape[0] = ceil(blockShape[0], int64_t(warps[0]));
75+
blockShape[1] = ceil(blockShape[1], int64_t(warps[1]));
6176

6277
// group0 (128 bits / 4 dwords) effective bit encoding:
6378
// [1:0]: pred (to be filled later)
@@ -122,19 +137,27 @@ void fillTDMDescriptor(RewriterBase &rewriter, Location loc,
122137
decodeTDMDescriptor(rewriter, loc, group0, group1);
123138

124139
auto warpId = getLaneAndWarpId(rewriter, loc).second;
125-
int outerBlockShapePerWarp = ceil(blockShape[0], int64_t(numWarps));
126-
int outerBlockStride = blockShape[1];
140+
auto warps = getWarpDistribution(blockShape, numWarps);
127141

128142
// Shift global pointer by offset
129-
Value outerOffset = b.mul(b.i32_val(outerBlockShapePerWarp), warpId);
130-
offset[0] = b.add(offset[0], outerOffset);
143+
Value warpDim0 = b.i32_val(warps[0]);
144+
SmallVector<Value, 2> warpCoord = {b.urem(warpId, warpDim0),
145+
b.udiv(warpId, warpDim0)};
146+
147+
SmallVector<Value, 2> globalOffset;
148+
for (int i = 0; i < 2; i++) {
149+
int64_t blockShapePerWarp = ceil(blockShape[i], int64_t(warps[i]));
150+
globalOffset.push_back(b.mul(b.i32_val(blockShapePerWarp), warpCoord[i]));
151+
offset[i] = b.add(offset[i], globalOffset[i]);
152+
}
131153

132154
Value baseOffset = b.add(b.mul(tensorStride[0], offset[0]),
133155
b.mul(tensorStride[1], offset[1]));
134156
srcPtr = b.gep(globalPtrTy, elementType, srcPtr, baseOffset);
135157

136158
// Shift shared pointer by offset
137-
Value dstOffset = b.mul(b.i32_val(outerBlockStride), outerOffset);
159+
Value dstOffset =
160+
b.add(b.mul(b.i32_val(blockShape[1]), globalOffset[0]), globalOffset[1]);
138161
if (padInterval > 0 && padAmount > 0) {
139162
Value iVal = b.i32_val(log2(padInterval));
140163
Value pVal = b.i32_val(log2(padAmount));

third_party/amd/python/test/test_gluon_gfx1250.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -365,9 +365,9 @@ def torch_gemm_mxfp(a, b, a_scale, b_scale, scale_block, M, N, K):
365365

366366
@gluon.jit
367367
def tensor_copy_kernel(a_ptr, b_ptr, M, N, #
368-
BLOCK_M: ttgl.constexpr, BLOCK_N: ttgl.constexpr, NUM_BUFFERS: ttgl.constexpr):
368+
BLOCK_M: ttgl.constexpr, BLOCK_N: ttgl.constexpr, NUM_BUFFERS: ttgl.constexpr,
369+
BLOCKED_LAYOUT: ttgl.constexpr):
369370
SHARED_LAYOUT: ttgl.constexpr = ttgl.PaddedSharedLayout.with_identity_for([[32, 4]], [BLOCK_M, BLOCK_N], [1, 0])
370-
BLOCKED_LAYOUT: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [4, 8], [4, 1], [1, 0])
371371

372372
pid = ttgl.program_id(axis=0)
373373
num_pid_m = ttgl.cdiv(M, BLOCK_M)
@@ -400,31 +400,38 @@ def tensor_copy_kernel(a_ptr, b_ptr, M, N, #
400400
@pytest.mark.parametrize("BLOCK_M,BLOCK_N", [(32, 32), (32, 64), (64, 64)])
401401
@pytest.mark.parametrize("NUM_BUFFERS", [1, 2])
402402
def test_compile_tensor_copy(BLOCK_M, BLOCK_N, NUM_BUFFERS):
403+
BLOCKED_LAYOUT = ttgl.BlockedLayout([1, 8], [4, 8], [4, 1], [1, 0])
403404
k = triton.compile(
404405
gluon._runtime.GluonASTSource(
405406
fn=tensor_copy_kernel, signature={
406407
"a_ptr": "*fp16", "b_ptr": "*fp16", "M": "i32", "N": "i32", #
407-
"BLOCK_M": "constexpr", "BLOCK_N": "constexpr", "NUM_BUFFERS": "constexpr"
408-
}, constexprs={"BLOCK_M": BLOCK_M, "BLOCK_N": BLOCK_N, "NUM_BUFFERS": NUM_BUFFERS}),
409-
target=GPUTarget("hip", 'gfx1250', 32))
408+
"BLOCK_M": "constexpr", "BLOCK_N": "constexpr", "NUM_BUFFERS": "constexpr", #
409+
"BLOCKED_LAYOUT": "constexpr"
410+
}, constexprs={
411+
"BLOCK_M": BLOCK_M, "BLOCK_N": BLOCK_N, "NUM_BUFFERS": NUM_BUFFERS, "BLOCKED_LAYOUT": BLOCKED_LAYOUT
412+
}), target=GPUTarget("hip", 'gfx1250', 32))
410413

411414
amdgcn = k.asm["amdgcn"]
412415
for pattern in ("tensor_load_to_lds", "s_wait_tensorcnt 0x0"):
413416
assert re.search(pattern, amdgcn)
414417

415418

416-
@pytest.mark.parametrize("BLOCK_M,BLOCK_N", [(32, 32), (32, 64), (64, 64)])
419+
@pytest.mark.parametrize("BLOCK_M,BLOCK_N", [(32, 32), (32, 64), (64, 64), (1, 512), (256, 2)])
417420
@pytest.mark.parametrize("NUM_BUFFERS", [1, 2])
421+
@pytest.mark.parametrize("NUM_WARPS", [4, 8])
418422
@pytest.mark.parametrize("M,N", [(1024, 1024), (1000, 1000)])
419-
def test_runtime_tensor_copy(M, N, BLOCK_M, BLOCK_N, NUM_BUFFERS):
423+
def test_runtime_tensor_copy(M, N, BLOCK_M, BLOCK_N, NUM_BUFFERS, NUM_WARPS):
424+
blocked_layout = ttgl.BlockedLayout([1, 8], [4, 8], [NUM_WARPS, 1], [1, 0])
425+
420426
torch.manual_seed(42)
421427
a = torch.randint(0x0, 0xFFFF, (M, N), dtype=torch.uint16)
422428
b = torch.zeros_like(a)
423429

424430
a_device = a.cuda()
425431
b_device = b.cuda()
426432
grid = (triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N * NUM_BUFFERS), 1)
427-
tensor_copy_kernel[grid](a_device, b_device, M, N, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, NUM_BUFFERS=NUM_BUFFERS)
433+
tensor_copy_kernel[grid](a_device, b_device, M, N, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, NUM_BUFFERS=NUM_BUFFERS,
434+
BLOCKED_LAYOUT=blocked_layout, num_warps=NUM_WARPS)
428435

429436
b_triton = b_device.cpu()
430437
assert torch.equal(b_triton, a)

0 commit comments

Comments
 (0)