Commit 07478c2

[AMD] Support shared encoding swizzle for BufferLoadToLocal (triton-lang#6329)
Adds lowering support for actual swizzled shared encodings. Because our direct-to-LDS loads must write coalesced into shared memory, we have to apply the swizzling to the global pointers instead. This is done by exchanging the global addresses between the lanes of a warp via `permute`, following the swizzle pattern. In the future we might want to apply the swizzling by changing the source layout so that each lane directly computes the right address, but currently this does not work because ops like `ExpandDim` do not work if the distributed layout moves in 2 dimensions in a basis. Support for non-global loads to LDS will follow in a separate PR after this one has landed.
1 parent 52e7a4b commit 07478c2
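Before the diffs, here is a minimal standalone sketch (not code from this PR) of the mechanism the commit message describes: with the phase-XOR swizzle pattern assumed here (phase = (row / perPhase) % maxPhase, swizzled chunk = chunk ^ phase), the element that must land in a lane's coalesced shared-memory slot is owned by another lane of the same warp, so each lane fetches that lane's global pointer via a lane permute before issuing the direct-to-LDS load. The `swizzleChunk` helper and the one-chunk-per-lane layout are illustrative assumptions.

```cpp
// Sketch: which lane a given lane must take its global pointer from, assuming
// an XOR-phase swizzle and one vec-sized chunk per lane along the swizzled dim.
#include <cstdio>

// phase = (row / perPhase) % maxPhase; the chunk written to slot `chunk` is
// the one whose swizzled position equals `chunk`, i.e. chunk ^ phase.
unsigned swizzleChunk(unsigned chunk, unsigned row, unsigned perPhase,
                      unsigned maxPhase) {
  unsigned phase = (row / perPhase) % maxPhase;
  return chunk ^ phase;
}

int main() {
  // Example values matching the first new lit test: vec = 1, perPhase = 1,
  // maxPhase = 4, 64 lanes along the contiguous dimension.
  const unsigned perPhase = 1, maxPhase = 4;
  for (unsigned row = 0; row < 4; ++row) {
    for (unsigned lane = 0; lane < 8; ++lane) { // print a few lanes only
      unsigned srcLane = swizzleChunk(lane, row, perPhase, maxPhase);
      // srcLane stays inside [0, 64): the swizzle never leaves the warp.
      printf("row %u: lane %2u takes the global ptr of lane %2u\n", row, lane,
             srcLane);
    }
  }
}
```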

4 files changed, +283 -35 lines

test/Conversion/amd/buffer_load_to_local_to_llvm.mlir

Lines changed: 120 additions & 0 deletions
@@ -162,3 +162,123 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [8, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 4, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: buffer_load_swizzled_simple
+  tt.func public @buffer_load_swizzled_simple(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
+                                              %arg1: !tt.ptr<f16>,
+                                              %arg2: tensor<16x64xi32, #blocked>,
+                                              %arg3: !ttg.memdesc<16x64xf16, #shared, #smem, mutable>) {
+    // Each thread needs to load 2 elements and we load 1 (sizePerThread) per buffer load instruction
+    // COMMON: rocdl.make.buffer.rsrc
+    // COMMON-NOT: rocdl.make.buffer.rsrc
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON-NOT: rocdl.raw.ptr.buffer.load.lds
+    %65 = amdgpu.buffer_load_to_local %arg1[%arg2] into %arg3 {OpIdx = #amdgpu.OpIdx<1>} : <f16>[tensor<16x64xi32, #blocked>] -> <16x64xf16, #shared, #smem, mutable>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 2, maxPhase = 8, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: buffer_load_to_local_swizzled_mask_other
+  tt.func public @buffer_load_to_local_swizzled_mask_other(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
+                                                           %arg1: !tt.ptr<f16>,
+                                                           %arg2: tensor<32x32xi32, #blocked>,
+                                                           %arg3: !ttg.memdesc<32x32xf16, #shared, #smem, mutable>,
+                                                           %arg4: i32) {
+    // We need the splat to allow the AxisAnalysis to work during lowering
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #blocked>
+    %c0_i32 = arith.constant 0 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %1 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x32x!tt.ptr<f16>, #blocked>
+    %29 = arith.addi %arg4, %c31_i32 : i32
+    %30 = arith.divsi %29, %c32_i32 : i32
+    %31 = arith.cmpi sgt, %30, %c0_i32 : i32
+
+    %51 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %52 = tt.expand_dims %51 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked>
+    %65 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #blocked>
+    %66 = arith.cmpi slt, %52, %65 : tensor<32x1xi32, #blocked>
+    %67 = tt.broadcast %66 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked>
+
+    %70 = tt.splat %31 : i1 -> tensor<32x32xi1, #blocked>
+    %71 = arith.andi %70, %67 : tensor<32x32xi1, #blocked>
+
+    // Each thread needs to load 4 elements and we load 1 (sizePerThread) per buffer load instruction
+    // Note that mask/other alignment is 1 so we need 4 conditionals
+
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.ballot
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: _predicated_store
+
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.ballot
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: _predicated_store
+
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.ballot
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: _predicated_store
+
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.ballot
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: _predicated_store
+
+    // COMMON-NOT: rocdl.ds_bpermute
+    // COMMON-NOT: rocdl.ballot
+    // COMMON-NOT: rocdl.raw.ptr.buffer.load.lds
+    // COMMON-NOT: _predicated_store
+
+    amdgpu.buffer_load_to_local %arg1[%arg2] mask=%67 other=%cst_0 into %arg3 {OpIdx = #amdgpu.OpIdx<1>} : <f16>[tensor<32x32xi32, #blocked>] tensor<32x32xf16, #blocked> -> <32x32xf16, #shared, #smem, mutable>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 32], order = [0, 1]}>
+#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 4, maxPhase = 16, order = [0, 1]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.shared = 0 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: buffer_load_to_local_swizzled_vectorized_8xf16
+  tt.func public @buffer_load_to_local_swizzled_vectorized_8xf16(%arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !ttg.memdesc<64x64xf16, #shared, #smem, mutable>) {
+    %cst = arith.constant dense<64> : tensor<1x64xi32, #blocked>
+    %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %2 = tt.expand_dims %0 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked>
+    %3 = tt.broadcast %2 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked>
+    %4 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
+    %5 = arith.muli %4, %cst : tensor<1x64xi32, #blocked>
+    %6 = tt.broadcast %5 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked>
+    %7 = arith.addi %3, %6 : tensor<64x64xi32, #blocked>
+
+    // Each thread needs to load 8 elements and we load 8 (sizePerThread) per buffer load instruction
+    // GFX950: rocdl.make.buffer.rsrc
+    // GFX950: rocdl.ds_bpermute
+    // GFX950: rocdl.raw.ptr.buffer.load.lds
+    // GFX950-NOT: rocdl.raw.ptr.buffer.load.lds
+
+    // GFX942 does not support vectorization > 4bytes so we cannot lower it
+    // GFX942-NOT: rocdl.raw.ptr.buffer.load.lds
+    // GFX942: amdgpu.buffer_load_to_local
+    %8 = amdgpu.buffer_load_to_local %arg1[%7] into %arg2 : <f16>[tensor<64x64xi32, #blocked>] -> <64x64xf16, #shared, #smem, mutable>
+    tt.return
+  }
+}
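The CHECK counts in the first test above follow from simple arithmetic; the snippet below is a hypothetical helper (not part of the test suite) that just spells it out for the @buffer_load_swizzled_simple encoding: a 16x64 f16 tile split across 8 warps of 64 lanes with sizePerThread = [1, 1] and vec = 1 leaves two elements per lane, hence two direct-to-LDS loads per lane, each preceded by one ds_bpermute for the swizzled pointer.

```cpp
// Rough arithmetic behind the expected instruction counts in the first test.
#include <cstdio>

int main() {
  const unsigned tileElems = 16 * 64;             // 16x64 f16 tile
  const unsigned warps = 8, lanes = 64, vecElems = 1;
  unsigned loadsPerLane = tileElems / (warps * lanes * vecElems);
  printf("rocdl.raw.ptr.buffer.load.lds per lane: %u\n", loadsPerLane); // 2
  printf("rocdl.ds_bpermute per lane:             %u\n", loadsPerLane); // one per load
}
```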

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 115 additions & 22 deletions
@@ -420,31 +420,90 @@ struct BufferLoadToLocalOpConversion
     if (llOther)
       otherElems = unpackLLElements(loc, llOther, rewriter);
 
-    // buffer_load into LDS does not support per lane offsets.
-    // We need to ensure that we write coalesced into shared memory.
     auto dstTy = op.getDest().getType();
-    if (!LLVM::AMD::canCoalesceWriteIntoSharedMemory(rewriter, ptrType, dstTy,
-                                                     vec)) {
+    auto sharedEnc = cast<SwizzledSharedEncodingAttr>(dstTy.getEncoding());
+
+    // buffer_load into LDS does not support per lane shared offsets. We need to
+    // ensure that we write coalesced into shared memory.
+    //
+    // For *non* swizzled shared encodings we check if they result in
+    // coalesced writes and can then lower them directly to the intrinsics.
+    //
+    // For swizzled shared encodings we need to transfer the swizzling to the
+    // source pointers. For now this is done by swizzling the pointers between
+    // the lane of a warp via permute. This only works if the swizzle pattern
+    // does not exchange elements between warps which holds for all our swizzle
+    // patterns. There is still a check performed to not silently produce wrong
+    // results if we invalidate the condition in the future
+
+    bool hasSwizzling = sharedEnc.getMaxPhase() != 1;
+
+    // Compute the blocked -> shared linear layout to check preconditions
+    auto shape = ptrType.getShape();
+    LinearLayout srcLayout =
+        triton::gpu::toLinearLayout(shape, ptrType.getEncoding());
+    LinearLayout sharedLayout =
+        triton::gpu::toLinearLayout(shape, dstTy.getEncoding());
+    LinearLayout srcToSharedLayout = srcLayout.invertAndCompose(sharedLayout);
+
+    unsigned threadsPerWarp = lookupThreadsPerWarp(rewriter);
+    if (!hasSwizzling && !LLVM::AMD::canCoalesceWriteIntoSharedMemory(
+                             rewriter, srcToSharedLayout, threadsPerWarp)) {
+      return rewriter.notifyMatchFailure(
+          op, "does not write coalesced into LDS and is not swizzled");
+    }
+
+    if (hasSwizzling && !LLVM::AMD::doesSwizzleInsideWarp(
+                            rewriter, srcToSharedLayout, threadsPerWarp)) {
       return rewriter.notifyMatchFailure(op,
-                                         "does not write coalesced into LDS");
+                                         "does swizzle across warp boundaries");
     }
 
     auto resElemTy = getTypeConverter()->convertType(dstTy.getElementType());
     auto smemObj = mlir::LLVM::getSharedMemoryObjectFromStruct(
         loc, llDst, resElemTy, rewriter);
 
-    // First we determine the vector size per load and collect the
-    // shared addresses. This will only emit the address calculation and not the
-    // actual loads
+    auto emitSharedAddresses = [&](RankedTensorType srcTy, MemDescType dstTy,
+                                   SmallVector<Value> &shmemAddrs,
+                                   VectorType &vecTy) {
+      bool ok = emitTransferBetweenRegistersAndShared(
+          ptrType, dstTy, resElemTy, {}, smemObj, loc, rewriter, targetInfo,
+          [&](VectorType vecTy_, Value shmemAddr) {
+            vecTy = vecTy_;
+            shmemAddrs.push_back(shmemAddr);
+          });
+      assert(ok);
+    };
+
+    // Determine the vector size per load and collect the shared addresses. This
+    // will only emit the address calculation and not the actual loads.
+    // For swizzled loads we get the non swizzled/coalesced shared addresses
+    // from a temporary non swizzled layout. Those addresses will be used as the
+    // store addresses. Additionally, we compute the swizzled shared memory
+    // addresses which will be used to compute which lane holds the global ptr
+    // to the coalesced address
     VectorType vecTy;
-    SmallVector<Value> shmemAddrs;
-    bool ok = emitTransferBetweenRegistersAndShared(
-        ptrType, dstTy, resElemTy, {}, smemObj, loc, rewriter, targetInfo,
-        [&](VectorType vecTy_, Value shmemAddr) {
-          vecTy = vecTy_;
-          shmemAddrs.push_back(shmemAddr);
-        });
-    assert(ok);
+    SmallVector<Value> coalescedShmemAddr;
+    SmallVector<Value> swizzledShmemAddr;
+
+    if (!hasSwizzling) {
+      emitSharedAddresses(ptrType, dstTy, coalescedShmemAddr, vecTy);
+    } else {
+      emitSharedAddresses(ptrType, dstTy, swizzledShmemAddr, vecTy);
+      // Create non swizzled/coalesced encoding
+      auto dstEnc = cast<SwizzledSharedEncodingAttr>(dstTy.getEncoding());
+      auto flatSharedEnc = SwizzledSharedEncodingAttr::get(
+          getContext(), dstEnc.getVec(), 1, 1, dstEnc.getOrder(),
+          dstEnc.getCTALayout());
+      auto flatDstTy =
+          MemDescType::get(dstTy.getShape(), dstTy.getElementType(),
+                           flatSharedEnc, dstTy.getMemorySpace());
+      VectorType coalescedVecTy;
+      emitSharedAddresses(ptrType, flatDstTy, coalescedShmemAddr,
+                          coalescedVecTy);
+      assert(coalescedVecTy == vecTy);
+    }
+    assert(vecTy.getNumElements() == vec);
 
     int vecBits = vecTy.getNumElements() * vecTy.getElementTypeBitWidth();
     if (!targetInfo.supportsDirectToLdsLoadBitWidth(vecBits)) {
@@ -462,17 +521,43 @@ struct BufferLoadToLocalOpConversion
     // based on the collected shared addresses and vector size
     Value rsrcDesc = bufferEmitter.createResourceDescriptor(llPtr, llStride);
 
-    for (int i = 0; i < shmemAddrs.size(); i++) {
+    for (int i = 0; i < coalescedShmemAddr.size(); i++) {
       auto srcIdx = i * vec;
       auto offsetIn = offsetElems[srcIdx];
-
       Value pred = mask ? maskElems[srcIdx] : b.true_val();
+
+      if (hasSwizzling) {
+        // Compute the laneOffset based on the difference in elements between
+        // the two shmem addresses. laneOffset will be negative for half the
+        // lanes because a smaller laneId might hold our global_ptr.
+        auto coalescedAddr = b.ptrtoint(i64_ty, coalescedShmemAddr[i]);
+        auto swizzledAddr = b.ptrtoint(i64_ty, swizzledShmemAddr[i]);
+        auto diff = b.trunc(i32_ty, b.sub(swizzledAddr, coalescedAddr));
+        Value laneOffset = b.sdiv(diff, vecBytesVal);
+        // selectLane will always stay inside the warp [0,
+        // threadsPerWarp) because we only swizzle inside a warp
+        Value selectLane = b.add(getLaneId(rewriter, loc), laneOffset);
+
+        offsetIn = targetInfo.shuffleIdx(rewriter, loc, offsetIn, selectLane);
+
+        if (mask) {
+          // To swizzle the mask we can use ballot and then select the bit based
+          // on the lane id
+          auto warpMask =
+              targetInfo.ballot(rewriter, loc, rewriter.getI64Type(), pred);
+          // Extract the selectLane bit
+          auto bitMask =
+              b.lshr(warpMask, b.zext(rewriter.getI64Type(), selectLane));
+          pred = b.trunc(i1_ty, bitMask);
+        }
+      }
+
       bufferEmitter.emitLoadToLds(vecTy, vecBytesVal, rsrcDesc, offsetIn,
-                                  shmemAddrs[i], pred, op.getCache());
+                                  coalescedShmemAddr[i], pred, op.getCache());
       if (!otherElems.empty()) {
         Value storeVal = packElementRangeIntoVector(
            rewriter, this->getTypeConverter(), loc, vecTy, otherElems, srcIdx);
-        llStore(rewriter, loc, shmemAddrs[i], storeVal,
+        llStore(rewriter, loc, coalescedShmemAddr[i], storeVal,
                 b.icmp_ne(maskElems[srcIdx], b.true_val()), op.getCache());
       }
     }
@@ -534,11 +619,19 @@ struct AsyncCopyGlobalToLocalOpConversion
     auto maskElements = getMaskElemsAndUpdateVeclen(
         rewriter, loc, adaptor.getMask(), op.getMask(), maxVec);
 
+    auto shape = srcTy.getShape();
+    LinearLayout srcLayout =
+        triton::gpu::toLinearLayout(shape, srcTy.getEncoding());
+    LinearLayout sharedLayout =
+        triton::gpu::toLinearLayout(shape, dstTy.getEncoding());
+    LinearLayout srcToSharedLayout = srcLayout.invertAndCompose(sharedLayout);
+
     // global.load.lds does not support per lane offsets.
     // We need to ensure that we write coalesced into shared memory. This means
     // that the kLane dim needs to be contigeous based on the vector size.
-    if (!LLVM::AMD::canCoalesceWriteIntoSharedMemory(rewriter, srcTy, dstTy,
-                                                     maxVec)) {
+    unsigned threadsPerWarp = lookupThreadsPerWarp(rewriter);
+    if (!LLVM::AMD::canCoalesceWriteIntoSharedMemory(
+            rewriter, srcToSharedLayout, threadsPerWarp)) {
       return rewriter.notifyMatchFailure(op,
                                          "does not write coalesced into LDS");
     }
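The core of the new lowering is easiest to see as a warp-level simulation. The sketch below is an illustrative C++ model, not the conversion code itself: arrays indexed by lane stand in for per-lane SSA values, the XOR-by-3 swizzle is a made-up stand-in for a real swizzled layout, and `shuffleIdx`/`ballot` are modelled with a plain array read and a bit mask. It shows the two tricks added above: deriving the source lane from the byte difference between the swizzled and coalesced shared addresses, and swizzling the per-lane predicate through a ballot mask.

```cpp
// Warp-level sketch of the swizzled BufferLoadToLocal lowering (assumptions:
// 64-lane warp, one f16 per lane, XOR-by-3 swizzle as a placeholder).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int threadsPerWarp = 64;
  const int vecBytes = 2; // one f16 per lane per load

  std::vector<int64_t> coalescedAddr(threadsPerWarp), swizzledAddr(threadsPerWarp);
  std::vector<int32_t> globalOffset(threadsPerWarp);
  std::vector<bool> pred(threadsPerWarp);
  for (int lane = 0; lane < threadsPerWarp; ++lane) {
    coalescedAddr[lane] = lane * vecBytes;         // where this lane must write
    swizzledAddr[lane] = (lane ^ 3) * vecBytes;    // where its element belongs
    globalOffset[lane] = 1000 + lane;              // per-lane global offset
    pred[lane] = (lane % 2 == 0);                  // per-lane mask bit
  }

  // "ballot": one bit per lane carrying that lane's predicate.
  uint64_t warpMask = 0;
  for (int lane = 0; lane < threadsPerWarp; ++lane)
    if (pred[lane]) warpMask |= (1ull << lane);

  for (int lane = 0; lane < threadsPerWarp; ++lane) {
    // laneOffset may be negative; selectLane stays in [0, threadsPerWarp)
    // because the swizzle never crosses a warp boundary.
    int laneOffset = int(swizzledAddr[lane] - coalescedAddr[lane]) / vecBytes;
    int selectLane = lane + laneOffset;

    // "shuffleIdx"/ds_bpermute: take the global offset from selectLane.
    int32_t offsetIn = globalOffset[selectLane];
    // Predicate of selectLane, extracted as one bit of the ballot mask.
    bool swizzledPred = (warpMask >> selectLane) & 1;

    if (lane < 8)
      printf("lane %2d loads offset %d (pred=%d) into coalesced addr %lld\n",
             lane, (int)offsetIn, (int)swizzledPred,
             (long long)coalescedAddr[lane]);
  }
}
```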

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp

Lines changed: 40 additions & 10 deletions
@@ -605,20 +605,14 @@ Type scaleDotElemTypeToMLIRType(MLIRContext *ctx, triton::ScaleDotElemType t) {
 }
 
 bool canCoalesceWriteIntoSharedMemory(RewriterBase &rewriter,
-                                      RankedTensorType srcTy,
-                                      triton::gpu::MemDescType dstTy,
-                                      unsigned vectorSize) {
-  auto shape = srcTy.getShape();
-  LinearLayout srcLayout =
-      triton::gpu::toLinearLayout(shape, srcTy.getEncoding());
-  LinearLayout sharedLayout =
-      triton::gpu::toLinearLayout(shape, dstTy.getEncoding());
-  LinearLayout srcToSharedLayout = srcLayout.invertAndCompose(sharedLayout);
+                                      const LinearLayout &srcToSharedLayout,
+                                      unsigned threadsPerWarp) {
+  auto contig = srcToSharedLayout.getNumConsecutiveInOut();
 
   StringAttr kLane = rewriter.getStringAttr("lane");
   for (int inLane : llvm::seq(srcToSharedLayout.getInDimSizeLog2(kLane))) {
     auto basis = srcToSharedLayout.getBasis(kLane, inLane)[0];
-    unsigned expected = vectorSize * (1 << inLane);
+    unsigned expected = contig * (1 << inLane);
     if (basis != expected) {
       LDBG("detected uncoalesced layout from blocked to shared in async copy "
            "for lane "
@@ -627,6 +621,42 @@ bool canCoalesceWriteIntoSharedMemory(RewriterBase &rewriter,
       return false;
     }
   }
+  // Additionally we could swizzle based on the warp dimension so we need to
+  // check that when all bases are divided by contig, none of the first
+  // (log2(warpSize) + 1) bits are set to 1
+  assert(llvm::isPowerOf2_32(threadsPerWarp));
+  assert(llvm::isPowerOf2_32(contig));
+  unsigned mask = (threadsPerWarp * contig) - 1;
+  StringAttr kWarp = rewriter.getStringAttr("warp");
+  for (int inWarp : llvm::seq(srcToSharedLayout.getInDimSizeLog2(kWarp))) {
+    auto basis = srcToSharedLayout.getBasis(kWarp, inWarp)[0];
+    if ((basis & mask) != 0) {
+      LDBG("detected uncoalesced layout from blocked to shared in async copy "
+           "for warp "
+           << inWarp);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool doesSwizzleInsideWarp(RewriterBase &rewriter,
+                           const LinearLayout &srcToSharedLayout,
+                           unsigned threadsPerWarp) {
+  auto contig = srcToSharedLayout.getNumConsecutiveInOut();
+  // If all bases in lane dimension are below threadsPerWarp multiplied with the
+  // contiguity we do not swizzle across warp boundaries.
+  assert(llvm::isPowerOf2_32(threadsPerWarp));
+  unsigned upperLimit = threadsPerWarp * contig;
+
+  StringAttr kLane = rewriter.getStringAttr("lane");
+  for (int inLane : llvm::seq(srcToSharedLayout.getInDimSizeLog2(kLane))) {
+    auto basis = srcToSharedLayout.getBasis(kLane, inLane)[0];
+    if (basis >= upperLimit) {
+      return false;
+    }
+  }
   return true;
 }
 
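The two utility checks above reduce to simple conditions on the linear-layout bases. The following standalone mirror is an assumption-laden sketch, not the committed helpers: bases are plain integers, with lane basis i giving the shared-memory offset contributed by lane bit i, and `contig` playing the role of `srcToSharedLayout.getNumConsecutiveInOut()`; the function names `canCoalesceWrite` and `swizzlesInsideWarp` are my own.

```cpp
// Standalone mirror of the basis checks on plain integer bases.
#include <cassert>
#include <cstdio>
#include <vector>

bool canCoalesceWrite(const std::vector<unsigned> &laneBases,
                      const std::vector<unsigned> &warpBases,
                      unsigned contig, unsigned threadsPerWarp) {
  // Lane bit i must map to contig * 2^i so that consecutive lanes write
  // consecutive vec-sized chunks of shared memory.
  for (unsigned i = 0; i < laneBases.size(); ++i)
    if (laneBases[i] != contig * (1u << i))
      return false;
  // Warp bases must not land inside the contiguous per-warp region.
  assert((threadsPerWarp & (threadsPerWarp - 1)) == 0);
  unsigned mask = threadsPerWarp * contig - 1;
  for (unsigned basis : warpBases)
    if ((basis & mask) != 0)
      return false;
  return true;
}

bool swizzlesInsideWarp(const std::vector<unsigned> &laneBases, unsigned contig,
                        unsigned threadsPerWarp) {
  // If every lane basis stays below threadsPerWarp * contig, the permutation
  // induced by the swizzle never leaves the warp.
  unsigned upperLimit = threadsPerWarp * contig;
  for (unsigned basis : laneBases)
    if (basis >= upperLimit)
      return false;
  return true;
}

int main() {
  // 64 lanes, 2 contiguous f16 elements per lane, identity lane mapping.
  std::vector<unsigned> lane = {2, 4, 8, 16, 32, 64};
  std::vector<unsigned> warp = {128, 256, 512};
  printf("coalesced write: %d\n", (int)canCoalesceWrite(lane, warp, 2, 64));
  printf("in-warp swizzle: %d\n", (int)swizzlesInsideWarp(lane, 2, 64));
}
```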

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h

Lines changed: 8 additions & 3 deletions
@@ -88,9 +88,14 @@ Type scaleDotElemTypeToMLIRType(MLIRContext *ctx, triton::ScaleDotElemType t);
 // Returns true if we can perform coalesced write from the source encoding to
 // the destination encoding.
 bool canCoalesceWriteIntoSharedMemory(RewriterBase &rewriter,
-                                      RankedTensorType srcTy,
-                                      triton::gpu::MemDescType dstTy,
-                                      unsigned vectorSize);
+                                      const LinearLayout &srcToSharedLayout,
+                                      unsigned threadsPerWarp);
+
+// Returns true if the swizzling pattern does only swizzle the shared memory
+// offsets of a warp and does not exchange destination elements across warps
+bool doesSwizzleInsideWarp(RewriterBase &rewriter,
+                           const LinearLayout &srcToSharedLayout,
+                           unsigned threadsPerWarp);
 
 // Return true if op is used by DotScaledOp or UpcastMXFPOp ops.
 bool isUsedByDotScaledOp(Operation *op);
