Skip to content

Commit 93367dc

Browse files
authored
[AMD] Add general swizzling support for allocation (#7606)
This PR removes the old assertion now that general swizzling for convert layout is supported in the AMD backend. Until recently, the AMD backend lacked support for general swizzling in ConvertLayoutOp operations. To handle this limitation, the code contained a blocking assertion that would terminate execution if the swizzling path was attempted. This wasn't immediately problematic because the system could fall back to an alternative approach that achieved the same functional result using `defaultAllocationAnalysisScratchSizeFn`. Now that general swizzling is operational, this assertion can be safely removed, allowing ConvertLayoutOp to use `getConvertLayoutScratchInBytes` directly rather than relying on external implementations.
1 parent 33462c8 commit 93367dc

File tree

4 files changed

+30
-28
lines changed

4 files changed

+30
-28
lines changed

include/triton/Analysis/Allocation.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ getScratchCvtInOutVecLengths(RankedTensorType srcTy, RankedTensorType dstTy);
6363
ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
6464
RankedTensorType dstTy);
6565

66+
unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
67+
RankedTensorType dstTy);
68+
69+
unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
70+
RankedTensorType dstTy);
6671
} // namespace triton
6772

6873
/// Modified from llvm-15.0: llvm/ADT/AddressRanges.h

lib/Analysis/Allocation.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ static unsigned getBitwidth(RankedTensorType ty) {
3939
return isPtr ? kPtrBitWidth : std::max(ty.getElementTypeBitWidth(), 8u);
4040
}
4141

42-
static unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
43-
RankedTensorType dstTy) {
42+
unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
43+
RankedTensorType dstTy) {
4444
auto *ctx = srcTy.getContext();
4545
auto srcLayout = gpu::toLinearLayout(srcTy);
4646
auto dstLayout = gpu::toLinearLayout(dstTy);
@@ -52,8 +52,8 @@ static unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
5252
return smem.getTotalOutDimSize() / reps;
5353
}
5454

55-
static unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
56-
RankedTensorType dstTy) {
55+
unsigned getNumScratchElemsPaddedCvt(RankedTensorType srcTy,
56+
RankedTensorType dstTy) {
5757
auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
5858
return getNumScratchElements(scratchConfig.paddedRepShape);
5959
}

test/TritonGPU/amd/optimize-lds-usage.mlir

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,9 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
3636
#shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1]}>
3737
#smem = #ttg.shared_memory
3838
module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
39-
tt.func public @alloc_convert_small_load(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x128xf16, #blocked>) attributes {noinline = false} {
39+
tt.func public @alloc_convert_small_load(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<256x128xf16, #blocked>) attributes {noinline = false} {
4040
%1 = ttg.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !ttg.memdesc<128x128xf16, #shared, #smem>
41-
%2 = ttg.convert_layout %arg1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #mma>
41+
%2 = ttg.convert_layout %arg1 : tensor<256x128xf16, #blocked> -> tensor<256x128xf16, #mma>
4242
%3 = ttg.local_load %1 : !ttg.memdesc<128x128xf16, #shared, #smem> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
4343
tt.return
4444
}
@@ -62,9 +62,9 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
6262
#shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1, 2]}>
6363
#smem = #ttg.shared_memory
6464
module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
65-
tt.func public @alloc_convert_3d_load(%arg0: tensor<1x128x128xf16, #blocked>, %arg1: tensor<1x128x128xf16, #blocked>) attributes {noinline = false} {
65+
tt.func public @alloc_convert_3d_load(%arg0: tensor<1x128x128xf16, #blocked>, %arg1: tensor<1x256x128xf16, #blocked>) attributes {noinline = false} {
6666
%1 = ttg.local_alloc %arg0 : (tensor<1x128x128xf16, #blocked>) -> !ttg.memdesc<1x128x128xf16, #shared, #smem>
67-
%2 = ttg.convert_layout %arg1 : tensor<1x128x128xf16, #blocked> -> tensor<1x128x128xf16, #mma>
67+
%2 = ttg.convert_layout %arg1 : tensor<1x256x128xf16, #blocked> -> tensor<1x256x128xf16, #mma>
6868
%3 = ttg.local_load %1 : !ttg.memdesc<1x128x128xf16, #shared, #smem> -> tensor<1x128x128xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
6969
tt.return
7070
}
@@ -87,9 +87,9 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
8787
#shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1]}>
8888
#smem = #ttg.shared_memory
8989
module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
90-
tt.func public @alloc_convert_32k_limit(%arg0: tensor<64x128xf16, #blocked>, %arg1: tensor<64x128xf16, #blocked>) attributes {noinline = false} {
90+
tt.func public @alloc_convert_32k_limit(%arg0: tensor<64x128xf16, #blocked>, %arg1: tensor<128x128xf16, #blocked>) attributes {noinline = false} {
9191
%1 = ttg.local_alloc %arg0 : (tensor<64x128xf16, #blocked>) -> !ttg.memdesc<64x128xf16, #shared, #smem>
92-
%2 = ttg.convert_layout %arg1 : tensor<64x128xf16, #blocked> -> tensor<64x128xf16, #mma>
92+
%2 = ttg.convert_layout %arg1 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #mma>
9393
%3 = ttg.local_load %1 : !ttg.memdesc<64x128xf16, #shared, #smem> -> tensor<64x128xf16, #ttg.dot_op<{opIdx = 0, kWidth = 4, parent = #mma}>>
9494
tt.return
9595
}
@@ -98,29 +98,29 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
9898
// -----
9999

100100
// Check that optimization correctly handles LDS shortcut (see #mma2 -> #dotop2 conversion)
101-
// CHECK-DAG: [[BLOCKED_1:#[a-z0-9]*]] = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
101+
// CHECK-DAG: [[BLOCKED_1:#[a-z0-9]*]] = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
102102
// CHECK-DAG: [[BLOCKED_2:#[a-z0-9]*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [2, 4], order = [0, 1]}>
103103
// CHECK-DAG: [[MMA_1:#[a-z0-9]*]] = #ttg.amd_mfma<{version = 2, warpsPerCTA = [8, 1], instrShape = [32, 32], isTransposed = true}>
104104
// CHECK-DAG: [[MMA_2:#[a-z0-9]*]] = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>
105105
// CHECK-DAG: [[SHARED:#[a-z0-9]*]] = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1]}>
106106

107107
// CHECK: tt.func public @mfma_dot_shortcut([[ARG_0:%[a-z0-9]*]]: {{.*}}, [[ARG_1:%[a-z0-9]*]]: {{.*}}, [[ARG_2:%[a-z0-9]*]]: {{.*}})
108108
// CHECK: [[ALLOC:%[0-9]+]] = ttg.local_alloc [[ARG_0]] : (tensor<128x128xf16, [[BLOCKED_1]]>) -> !ttg.memdesc<128x128xf16, [[SHARED]], #smem>
109-
// CHECK: [[INTERMEDIATE_CONV:%[0-9]+]] = ttg.convert_layout [[ARG_1]] {{.*}}: tensor<128x128xf32, [[BLOCKED_1]]> -> tensor<128x128xf32, [[BLOCKED_2]]>
110-
// CHECK: [[CONVERT_1:%[0-9]+]] = ttg.convert_layout [[INTERMEDIATE_CONV]] {{.*}}: tensor<128x128xf32, [[BLOCKED_2]]> -> tensor<128x128xf32, [[MMA_2]]>
109+
// CHECK: [[INTERMEDIATE_CONV:%[0-9]+]] = ttg.convert_layout [[ARG_1]] {{.*}}: tensor<256x128xf32, [[BLOCKED_1]]> -> tensor<256x128xf32, [[BLOCKED_2]]>
110+
// CHECK: [[CONVERT_1:%[0-9]+]] = ttg.convert_layout [[INTERMEDIATE_CONV]] {{.*}}: tensor<256x128xf32, [[BLOCKED_2]]> -> tensor<256x128xf32, [[MMA_2]]>
111111
// CHECK: [[CONVERT_2:%[0-9]+]] = ttg.convert_layout [[ARG_2]] {{.*}}: tensor<256x128xf16, [[MMA_1]]> -> tensor<256x128xf16, #ttg.dot_op<{opIdx = 0, parent = [[MMA_1]], kWidth = 4}>>
112112
// CHECK: [[LOAD:%[0-9]+]] = ttg.local_load [[ALLOC]] : !ttg.memdesc<128x128xf16, [[SHARED]], #smem> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = [[MMA_2]], kWidth = 4}>>
113-
#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
113+
#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
114114
#mma1 = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>
115115
#mma2 = #ttg.amd_mfma<{version = 2, warpsPerCTA = [8, 1], instrShape = [32, 32], isTransposed = true}>
116116
#dotop1 = #ttg.dot_op<{opIdx=0, parent=#mma1, kWidth=4}>
117117
#dotop2 = #ttg.dot_op<{opIdx=0, parent=#mma2, kWidth=4}>
118118
#shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [0, 1]}>
119119
#smem = #ttg.shared_memory
120120
module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
121-
tt.func public @mfma_dot_shortcut(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x128xf32, #blocked>, %arg2: tensor<256x128xf16, #mma2>) attributes {noinline = false} {
121+
tt.func public @mfma_dot_shortcut(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<256x128xf32, #blocked>, %arg2: tensor<256x128xf16, #mma2>) attributes {noinline = false} {
122122
%alloc = ttg.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !ttg.memdesc<128x128xf16, #shared, #smem>
123-
%convert_1 = ttg.convert_layout %arg1 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #mma1>
123+
%convert_1 = ttg.convert_layout %arg1 : tensor<256x128xf32, #blocked> -> tensor<256x128xf32, #mma1>
124124
%convert_2 = ttg.convert_layout %arg2 : tensor<256x128xf16, #mma2> -> tensor<256x128xf16, #dotop2>
125125
%load = ttg.local_load %alloc : !ttg.memdesc<128x128xf16, #shared, #smem> -> tensor<128x128xf16, #dotop1>
126126
tt.return

third_party/amd/lib/Analysis/AMDGPUAllocation.cpp

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,25 +19,22 @@ unsigned getConvertLayoutScratchInBytes(RankedTensorType srcTy,
1919
return 0;
2020
unsigned elems = 0;
2121
if (usePadding) {
22-
auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
23-
elems = getNumScratchElements(scratchConfig.paddedRepShape);
22+
elems = getNumScratchElemsPaddedCvt(srcTy, dstTy);
2423
} else {
25-
assert(false && "General swizzling for convert layout is not suported in "
26-
"AMD backend yet");
27-
// TODO use swizzling
24+
elems = getNumScratchElemsSwizzledCvt(srcTy, dstTy);
2825
}
2926
return elems * getBitwidth(srcTy) / 8;
3027
}
3128

3229
unsigned AMDAllocationAnalysisScratchSizeFn(Operation *op) {
33-
if (op->hasAttr(AttrSharedMemPadded)) {
34-
if (auto cvtLayout = dyn_cast<mlir::triton::gpu::ConvertLayoutOp>(op)) {
35-
auto srcTy = cvtLayout.getSrc().getType();
36-
auto dstTy = cvtLayout.getType();
37-
return getConvertLayoutScratchInBytes(srcTy, dstTy,
38-
op->hasAttr(AttrSharedMemPadded));
39-
}
30+
31+
if (auto cvtLayout = dyn_cast<mlir::triton::gpu::ConvertLayoutOp>(op)) {
32+
auto srcTy = cvtLayout.getSrc().getType();
33+
auto dstTy = cvtLayout.getType();
34+
return getConvertLayoutScratchInBytes(srcTy, dstTy,
35+
op->hasAttr(AttrSharedMemPadded));
4036
}
37+
4138
return defaultAllocationAnalysisScratchSizeFn(op);
4239
}
4340

0 commit comments

Comments
 (0)