
Commit 219433e

AlexAUT and antiagainst authored
[AMD] Optimize shared address calculation for async load (#7153)
On GFX9, direct-to-LDS loads write coalesced to LDS and therefore require the start LDS address as a scalar. This PR refactors the address calculation to compute the start address uniformly once instead of computing per-lane addresses. This improves the final codegen and reduces register usage. The swizzling computations are now based on the offset instead of the final addresses, which further helps codegen.

Note that the lowering can still produce incorrect loads if we store into a sub-view that slices along the two minor dimensions; pipelining only slices along the outermost dimension, so it is unaffected. This was already the case before the refactoring and will be converted to an error in a follow-up PR.

---------

Co-authored-by: Lei Zhang <[email protected]>
1 parent 343bd8e commit 219433e

File tree: 3 files changed, +201 −108 lines changed

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 7 additions & 0 deletions
```diff
@@ -543,6 +543,13 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
 
+[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
+    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
+    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
+    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
+    Value laneId, Value warpId,
+    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
+
 SmallVector<Value> loadSharedToDistributed(triton::gpu::LocalLoadOp localLoadOp,
                                            Type elemLlvmTy,
                                            const SharedMemoryObject &smemObj,
```

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 12 additions & 1 deletion
```diff
@@ -417,6 +417,18 @@ bool emitTransferBetweenRegistersAndShared(
     std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
     Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
+  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
+  return emitTransferBetweenRegistersAndShared(
+      regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
+      target, laneId, warpId, perVectorCallback);
+}
+
+bool emitTransferBetweenRegistersAndShared(
+    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
+    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
+    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
+    Value laneId, Value warpId,
+    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
   MLIRContext *ctx = rewriter.getContext();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
 
@@ -458,7 +470,6 @@ bool emitTransferBetweenRegistersAndShared(
       maxVecElems.value_or(std::numeric_limits<int>::max()));
 
   auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
-  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
   Value blockId =
       withCTAOffset ? target.getClusterCTAId(rewriter, loc) : b.i32_val(0);
```