[ConvertLayoutOpToLLVM] Reintroduce transferWithinBlock

whitneywhtsang · whitneywhtsang · commit 0d2fd2daa5e3 · 2025-08-14T14:23:00.000Z
Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp
@@ -208,8 +208,10 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
     auto dstTy = cvtLayout.getType();
     if (!cvtNeedsSharedMemory(srcTy, dstTy))
       return 0;
-    // The generic pass uses swizzling
-    auto elems = getNumScratchElemsSwizzledCvt(srcTy, dstTy);
+    // Pessimistically take the max. We will revisit later.
+    auto elems = std::max(getNumScratchElemsSwizzledCvt(srcTy, dstTy),
+                          getNumScratchElemsPaddedCvt(srcTy, dstTy));
+
     return elems * getBitwidth(srcTy) / 8;
   }
   if (isa<AtomicRMWOp, AtomicCASOp>(op)) {
diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -63,8 +63,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     } else if (llvm::is_contained(dims, kWarp)) {
       // Case 2: Transfer between values in the same CTA, in which case we move
       //         values through shared memory.
-      transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
-      return success();
+      return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter);
     } else if (llvm::is_contained(dims, kLane)) {
       // Case 3. Transfer between values in the same warp, in which case we try
       //         to move values using warp shuffles, though if the pattern is
@@ -75,8 +74,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       // TODO: Since data is only transferred within a warp over shared memory,
       // we should use `bar.warp.sync` instead of `barrier`, which will improve
       // latency when warps issue barriers on different cycles.
-      transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
-      return success();
+      return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter);
     } else if (llvm::is_contained(dims, kRegister)) {
       // Case 4. Transfer between values in the same thread, in which case we
       //         simply reorder the elements of adaptor.getSrc().
@@ -112,6 +110,27 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     return success();
   }
 
+  LogicalResult transferWithinBlock(ConvertLayoutOp op,
+                                    const LinearLayout &srcLayout,
+                                    const LinearLayout &dstLayout,
+                                    OpAdaptor adaptor,
+                                    ConversionPatternRewriter &rewriter) const {
+    assert(cvtNeedsSharedMemory(op.getSrc().getType(), op.getType()));
+
+    // Non-XPU targets use swizzling; XPU falls through to the padding path.
+    // HACK: Remove the XPU special case once XPU tests pass for swizzling.
+    if (!targetInfo.isXpu()) {
+      transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
+      return success();
+    }
+
+    Value result = transferWithinBlockPadding(op, adaptor.getSrc(), targetInfo,
+                                              getTypeConverter(), rewriter);
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+
   SmallVector<Value> transferWithinBlockSwizzlingImpl(
       Location loc, ConversionPatternRewriter &rewriter,
       const LinearLayout &srcLayout, const LinearLayout &dstLayout,
diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir
@@ -201,22 +201,22 @@ tt.func @longlive(%A : !tt.ptr<f16>) {
 
 // This example triggers graph coloring with > 1 colors.
 // expected-remark @below {{multi_color}}
-// expected-remark @below {{size = 1376}}
+// expected-remark @below {{size = 1504}}
 tt.func @multi_color(%A : !tt.ptr<f16>) {
-  // expected-remark @below {{offset = 1024, size = 64}}
+  // expected-remark @below {{offset = 1152, size = 64}}
   %cst = ttg.local_alloc : () -> !ttg.memdesc<4x8xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // expected-remark @below {{offset = 1344, size = 32}}
+  // expected-remark @below {{offset = 1472, size = 32}}
   %cst_0 = ttg.local_alloc : () -> !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // expected-remark @below {{offset = 1088, size = 128}}
+  // expected-remark @below {{offset = 1216, size = 128}}
   %cst_1 = ttg.local_alloc : () -> !ttg.memdesc<16x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
   %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
-  // expected-remark @below {{scratch offset = 0, size = 1024}}
+  // expected-remark @below {{scratch offset = 0, size = 1152}}
   %0 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
   %1 = ttg.local_load %cst : !ttg.memdesc<4x8xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x8xf16, #AL>
   // expected-remark @below {{offset = 0, size = 128}}
   %cst_3 = ttg.local_alloc : () -> !ttg.memdesc<4x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
   %2 = ttg.local_load %cst_0 : !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x4xf16, #AL>
-  // expected-remark @below {{scratch offset = 0, size = 1024}}
+  // expected-remark @below {{scratch offset = 0, size = 1152}}
   %3 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
   // expected-remark @below {{offset = 512, size = 256}}
   %cst_4 = ttg.local_alloc : () -> !ttg.memdesc<4x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
@@ -226,7 +226,7 @@ tt.func @multi_color(%A : !tt.ptr<f16>) {
   %5 = ttg.local_load %cst_5 : !ttg.memdesc<4x8xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x8xf16, #AL>
   // expected-remark @below {{offset = 0, size = 512}}
   %cst_6 = ttg.local_alloc : () -> !ttg.memdesc<8x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // expected-remark @below {{offset = 1216, size = 128}}
+  // expected-remark @below {{offset = 1344, size = 128}}
   %cst_7 = ttg.local_alloc : () -> !ttg.memdesc<2x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
   %6 = ttg.local_load %cst_0 : !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x4xf16, #AL>
   // expected-remark @below {{offset = 0, size = 512}}
@@ -237,7 +237,7 @@ tt.func @multi_color(%A : !tt.ptr<f16>) {
   %cst_10 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
   %7 = ttg.local_load %cst_1 : !ttg.memdesc<16x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x4xf16, #AL>
   %8 = ttg.local_load %cst_4 : !ttg.memdesc<4x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x32xf16, #AL>
-  // expected-remark @below {{scratch offset = 0, size = 1024}}
+  // expected-remark @below {{scratch offset = 0, size = 1152}}
   %9 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
   %cst_11 = arith.constant dense<0.000000e+00> : tensor<4x4xf16, #AL>
   %10 = ttg.local_load %cst_7 : !ttg.memdesc<2x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<2x32xf16, #AL>
@@ -248,16 +248,16 @@ tt.func @multi_color(%A : !tt.ptr<f16>) {
 
 // This example triggers graph coloring with multiple rounds
 // expected-remark @below {{multi_color_multi_rounds}}
-// expected-remark @below {{size = 9376}}
+// expected-remark @below {{size = 9504}}
 tt.func @multi_color_multi_rounds(%arg0: !tt.ptr<f16>) {
-  // expected-remark @below {{offset = 9344, size = 32}}
+  // expected-remark @below {{offset = 9472, size = 32}}
   %cst = ttg.local_alloc : () -> !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  // expected-remark @below {{offset = 9216, size = 128}}
+  // expected-remark @below {{offset = 9344, size = 128}}
   %cst_0 = ttg.local_alloc : () -> !ttg.memdesc<16x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
   // expected-remark @below {{offset = 0, size = 8192}}
   %cst_1 = ttg.local_alloc : () -> !ttg.memdesc<1024x4xf16, #A_SHARED, #ttg.shared_memory, mutable>
   %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
-  // expected-remark @below {{scratch offset = 8192, size = 1024}}
+  // expected-remark @below {{scratch offset = 8192, size = 1152}}
   %0 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
   %1 = ttg.local_load %cst : !ttg.memdesc<4x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<4x4xf16, #AL>
   // expected-remark @below {{offset = 8704, size = 128}}
@@ -267,7 +267,7 @@ tt.func @multi_color_multi_rounds(%arg0: !tt.ptr<f16>) {
   %cst_4 = ttg.local_alloc : () -> !ttg.memdesc<1x16x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
   %3 = ttg.local_load %cst_0 : !ttg.memdesc<16x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x4xf16, #AL>
   %4 = ttg.local_load %cst_1 : !ttg.memdesc<1024x4xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<1024x4xf16, #AL>
-  // expected-remark @below {{scratch offset = 0, size = 1024}}
+  // expected-remark @below {{scratch offset = 0, size = 1152}}
   %5 = ttg.convert_layout %cst_2 : tensor<16x32xf16, #AL> -> tensor<16x32xf16, #BL>
   %6 = ttg.local_load %cst_3 : !ttg.memdesc<2x32xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<2x32xf16, #AL>
   tt.return