Commit 72b2d9b
[AMD] Optimize to bypass ds_bpermute for direct-to-lds loads (#7064)

If the fastest dim along which we load elements from HBM is contiguous, we can apply the laneOffset directly to the source pointers/buffer offsets to obtain the swizzled addresses. This works only because we swap elements between lanes and swizzle only in the fastest dim. In general this performs better than using ds_bpermute.
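The idea, as a rough host-side sketch (hypothetical names; loops emulate what the lowering emits once per lane): when consecutive lanes load contiguous vectors, the cross-lane exchange that ds_bpermute performs can be replaced by a purely lane-local adjustment.

#include <cstdint>
#include <vector>

// Slow path: emulate the cross-lane exchange that ds_bpermute performs.
// Each lane reads the source offset held by lane (lane + laneOffset).
std::vector<int32_t> shufflePath(const std::vector<int32_t> &offsets,
                                 const std::vector<int32_t> &laneOffsets) {
  std::vector<int32_t> out(offsets.size());
  for (int lane = 0; lane < (int)offsets.size(); ++lane)
    out[lane] = offsets[lane + laneOffsets[lane]]; // index stays in the warp
  return out;
}

// Fast path: if lanes load contiguous vectors along the fastest dim, then
// offsets[lane + d] == offsets[lane] + d * vecSize, so each lane computes
// the exchanged value locally and ds_bpermute is bypassed.
std::vector<int32_t> adjustPath(const std::vector<int32_t> &offsets,
                                const std::vector<int32_t> &laneOffsets,
                                int32_t vecSize) {
  std::vector<int32_t> out(offsets.size());
  for (int lane = 0; lane < (int)offsets.size(); ++lane)
    out[lane] = offsets[lane] + laneOffsets[lane] * vecSize;
  return out;
}

Both paths agree exactly when offsets[lane + d] == offsets[lane] + d * vecSize, which is what the contiguity check added below is after; the fast path merely skips the round-trip through the LDS crossbar.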
1 parent 65a416a commit 72b2d9b

File tree

2 files changed: +59 -17 lines
test/Conversion/amd/buffer_load_to_local_to_llvm.mlir

Lines changed: 2 additions & 1 deletion

@@ -271,7 +271,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.sha
 
 // Each thread needs to load 8 elements and we load 8 (sizePerThread) per buffer load instruction
 // GFX950: rocdl.make.buffer.rsrc
-// GFX950: rocdl.ds_bpermute
+// Src ptrs are contiguous so we do expect to bypass the ds_bpermute (see lowering to LLVM)
+// GFX950-NOT: rocdl.ds_bpermute
 // GFX950: rocdl.raw.ptr.buffer.load.lds
 // GFX950-NOT: rocdl.raw.ptr.buffer.load.lds
 

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 57 additions & 16 deletions
@@ -239,21 +239,19 @@ struct DirectToLdsLoadConversionBase : public LoadStoreConversionBase {
     }
   }
 
-  // Emits the computation to get the lane index which holds the source
+  // Emits the computation to get the lane id offset which holds the source
   // pointers/offsets we need to store to shared memory
-  Value emitSwizzledLaneIndex(RewriterBase &rewriter, TritonLLVMOpBuilder &b,
-                              Location loc, Value coalescedShmem,
-                              Value swizzledShmem, Value vecBytes) const {
+  Value emitSwizzledLaneOffset(RewriterBase &rewriter, TritonLLVMOpBuilder &b,
+                               Location loc, Value coalescedShmem,
+                               Value swizzledShmem, Value vecBytes) const {
    // Compute the laneOffset based on the difference in elements between
    // the two shmem addresses. laneOffset will be negative for half the
    // lanes because a smaller laneId might hold our global_ptr.
    auto coalescedAddr = b.ptrtoint(i64_ty, coalescedShmem);
    auto swizzledAddr = b.ptrtoint(i64_ty, swizzledShmem);
    auto diff = b.trunc(i32_ty, b.sub(swizzledAddr, coalescedAddr));
    Value laneOffset = b.sdiv(diff, vecBytes);
-    // laneId + laneOffset will always stay inside the warp [0,
-    // threadsPerWarp) because we only swizzle inside a warp
-    return b.add(getLaneId(rewriter, loc), laneOffset);
+    return laneOffset;
  }
 
  // Swizzle the mask (1bit) based on selectLane via ballot
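For concreteness, a worked instance of the laneOffset arithmetic above, under assumed values (16-byte per-lane vectors):

#include <cassert>
#include <cstdint>

int main() {
  // Assumed values: this lane's coalesced shmem slot vs. the slot the
  // swizzled layout assigns to it.
  int64_t coalescedAddr = 0x1000;
  int64_t swizzledAddr = 0x0FC0;
  int32_t vecBytes = 16;

  // Same arithmetic as emitSwizzledLaneOffset: the byte difference divided
  // by the vector size gives a signed distance in lanes.
  int32_t laneOffset =
      static_cast<int32_t>(swizzledAddr - coalescedAddr) / vecBytes;
  assert(laneOffset == -4); // a smaller laneId holds our source pointer
}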
@@ -266,6 +264,21 @@ struct DirectToLdsLoadConversionBase : public LoadStoreConversionBase {
    auto bitMask = b.lshr(warpMask, b.zext(rewriter.getI64Type(), selectLane));
    return b.trunc(i1_ty, bitMask);
  }
+
+  // For direct-to-lds the order of the shared encoding decides the order we
+  // load elements from global memory. This function returns true if the fastest
+  // dim for the sharedEnc is contiguous for the global ptrs/offsets
+  bool isFastedLoadDimContiguous(Value srcPtrOrOffset,
+                                 MemDescType sharedTy) const {
+    auto fastestDim = triton::gpu::getOrder(sharedTy)[0];
+    AxisInfo *axisInfo = axisAnalysisPass.getAxisInfo(srcPtrOrOffset);
+
+    // This can happen if axis analysis fails (e.g. lit tests).
+    if (axisInfo->getRank() <= fastestDim)
+      return false;
+
+    return axisInfo->getContiguity(fastestDim) > 1;
+  }
 };
 
 struct LoadOpConversion : public ConvertOpToLLVMPattern<triton::LoadOp>,
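A self-contained sketch of what this predicate is getting at (hypothetical helper; the real code queries Triton's AxisInfo analysis rather than inspecting raw offsets):

#include <cstdint>
#include <vector>

// True if elements along the fastest shared-memory dim sit at consecutive
// addresses, mirroring the AxisInfo contiguity > 1 test above.
bool fastestDimLooksContiguous(const std::vector<int64_t> &elemOffsets) {
  for (size_t i = 1; i < elemOffsets.size(); ++i)
    if (elemOffsets[i] != elemOffsets[i - 1] + 1)
      return false;
  return elemOffsets.size() > 1;
}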
@@ -542,11 +555,26 @@ struct BufferLoadToLocalOpConversion
 
       if (hasSwizzling) {
         // Apply swizzling to the src offsets
-        Value swizzledLaneId =
-            emitSwizzledLaneIndex(rewriter, b, loc, coalescedShmemAddr[i],
-                                  swizzledShmemAddr[i], vecBytesVal);
-        offsetIn =
-            targetInfo.shuffleIdx(rewriter, loc, offsetIn, swizzledLaneId);
+        Value laneOffset =
+            emitSwizzledLaneOffset(rewriter, b, loc, coalescedShmemAddr[i],
+                                   swizzledShmemAddr[i], vecBytesVal);
+        // laneId + laneOffset will always stay inside the warp [0,
+        // threadsPerWarp) because we only swizzle inside a warp
+        Value swizzledLaneId = b.add(getLaneId(rewriter, loc), laneOffset);
+
+        if (isFastedLoadDimContiguous(offset, cast<MemDescType>(dstTy))) {
+          // Because rows are contiguous and we only swizzle inside rows by
+          // swapping elements between lanes we can add laneOffset * vecSize to
+          // the offset to apply the swizzling
+          offsetIn = b.add(
+              offsetIn, b.mul(laneOffset, b.i32_val(vecTy.getNumElements())));
+        } else {
+          // If rows are not contiguous in memory we need to shuffle the
+          // pointers to apply the swizzling to the src pointers
+          offsetIn =
+              targetInfo.shuffleIdx(rewriter, loc, offsetIn, swizzledLaneId);
+        }
+
        if (mask) {
          pred =
              shuffleMask(rewriter, b, loc, targetInfo, swizzledLaneId, pred);
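Note that even on the fast path the predicate still has to be exchanged across lanes. A host-side sketch of the ballot-style bit extraction that shuffleMask lowers to (assuming a warp-wide mask with one bit per lane, as the lshr/trunc above suggests):

#include <cstdint>

// warpMask packs one predicate bit per lane (what ballot produces on the
// GPU); extract the bit belonging to swizzledLaneId, i.e. lshr + trunc to i1.
bool shuffledMaskBit(uint64_t warpMask, int swizzledLaneId) {
  return (warpMask >> swizzledLaneId) & 1u;
}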
@@ -666,10 +694,23 @@ struct AsyncCopyGlobalToLocalOpConversion
 
       if (hasSwizzling) {
         // Apply swizzling to the src pointers
-        Value swizzledLaneId =
-            emitSwizzledLaneIndex(rewriter, b, loc, coalescedShmemAddr[i],
-                                  swizzledShmemAddr[i], vecBytesVal);
-        srcPtr = targetInfo.shuffleIdx(rewriter, loc, srcPtr, swizzledLaneId);
+        Value laneOffset =
+            emitSwizzledLaneOffset(rewriter, b, loc, coalescedShmemAddr[i],
+                                   swizzledShmemAddr[i], vecBytesVal);
+        // laneId + laneOffset will always stay inside the warp [0,
+        // threadsPerWarp) because we only swizzle inside a warp
+        Value swizzledLaneId = b.add(getLaneId(rewriter, loc), laneOffset);
+
+        if (isFastedLoadDimContiguous(op.getSrc(), cast<MemDescType>(dstTy))) {
+          // Because rows are contiguous and we only swizzle inside rows by
+          // swapping elements between lanes we can move the vecTy typed src
+          // pointer by laneOffset elements to apply the swizzling.
+          srcPtr = b.gep(srcPtr.getType(), vecTy, srcPtr, laneOffset);
+        } else {
+          // If rows are not contiguous in memory we need to shuffle the
+          // pointers to apply the swizzling to the src pointers
+          srcPtr = targetInfo.shuffleIdx(rewriter, loc, srcPtr, swizzledLaneId);
+        }
        if (!maskElements.empty()) {
          pred =
              shuffleMask(rewriter, b, loc, targetInfo, swizzledLaneId, pred);
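The pointer flavor of the fast path is the same trick in disguise; a minimal sketch (hypothetical Vec stand-in for the per-lane vecTy):

#include <cstdint>

using Vec = int16_t[8]; // stand-in for an 8 x f16 per-lane vector

// Analogue of b.gep(srcPtr.getType(), vecTy, srcPtr, laneOffset): C++
// pointer arithmetic on Vec* scales by sizeof(Vec), just as the GEP scales
// the (possibly negative) laneOffset by the vecTy element size.
const Vec *swizzledSrcPtr(const Vec *srcPtr, int32_t laneOffset) {
  return srcPtr + laneOffset;
}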
