@@ -25,6 +25,7 @@ using mlir::triton::AMD::ISAFamily;
 
 namespace ttg = mlir::triton::gpu;
 namespace tt = mlir::triton;
+namespace amdttg = mlir::triton::amdgpu;
 
 namespace mlir {
@@ -106,19 +107,47 @@ struct AdvanceBasePointer : public OpRewritePattern<scf::ForOp> {
   // optimization offsetInitializer is a value of offset on first loop iteration
   // incrementOp is an operation that advances offset tensor
   struct LoadData {
-    triton::amdgpu::BufferLoadOp load;
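+    // load is either an amdttg::BufferLoadOp or an amdttg::BufferLoadToLocalOp;
+    // the helpers below dispatch on the concrete type.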
+    Operation *load;
     Value offsetIncrement;
     Value baseIncrement;
     Value offsetInitializer;
     Operation *incrementOp;
   };
 
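+  // Helpers that let the rest of the pattern treat BufferLoadOp and
+  // BufferLoadToLocalOp uniformly: both ops carry a base pointer and an
+  // offsets tensor, but at different operand positions.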
+  static Value getOffset(Operation *load) {
+    if (auto specific = dyn_cast<amdttg::BufferLoadOp>(load))
+      return specific.getOffsets();
+    if (auto specific = dyn_cast<amdttg::BufferLoadToLocalOp>(load))
+      return specific.getOffsets();
+    llvm_unreachable("unsupported operation type");
+  }
+
+  static Value getBasePtr(Operation *load) {
+    if (auto specific = dyn_cast<amdttg::BufferLoadOp>(load))
+      return specific.getPtr();
+    if (auto specific = dyn_cast<amdttg::BufferLoadToLocalOp>(load))
+      return specific.getPtr();
+    llvm_unreachable("unsupported operation type");
+  }
+
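+  // The hard-coded operand indices below must stay in sync with the ops'
+  // definitions: BufferLoadToLocalOp's indices are shifted by one because
+  // its first operand is the destination in local (LDS) memory.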
+  static void setOffset(Operation *load, Value newOffset) {
+    assert((isa<amdttg::BufferLoadOp, amdttg::BufferLoadToLocalOp>(load)));
+    const int offsetIdx = isa<amdttg::BufferLoadOp>(load) ? 1 : 2;
+    load->setOperand(offsetIdx, newOffset);
+  }
+
+  static void setBasePtr(Operation *load, Value newBasePtr) {
+    assert((isa<amdttg::BufferLoadOp, amdttg::BufferLoadToLocalOp>(load)));
+    const int ptrIdx = isa<amdttg::BufferLoadOp>(load) ? 0 : 1;
+    load->setOperand(ptrIdx, newBasePtr);
+  }
+
   // Perform series of checks to decide if given operation could be optimized.
   // If optimization is possible, return filled LoadData
-  static std::optional<LoadData>
-  analyzeLoad(triton::amdgpu::BufferLoadOp loadOp, scf::ForOp targetFor) {
-    LDBG("Analyzing: " << loadOp);
-    Value maybeOffsetsBlockArg = loadOp.getOffsets();
+  static std::optional<LoadData> analyzeLoad(Operation *loadOp,
+                                             scf::ForOp targetFor) {
+    LDBG("Analyzing: " << *loadOp);
+    Value maybeOffsetsBlockArg = getOffset(loadOp);
     auto maybeOffsetDefOp = maybeOffsetsBlockArg.getDefiningOp();
     if (maybeOffsetDefOp && isa<arith::AddIOp>(maybeOffsetDefOp)) {
       for (auto &use : maybeOffsetDefOp->getUses()) {
@@ -145,7 +174,7 @@ struct AdvanceBasePointer : public OpRewritePattern<scf::ForOp> {
       LDBG("Rejected: expect load offset to be a target loop argument");
       return {};
     }
-    auto basePtr = loadOp.getPtr();
+    auto basePtr = getBasePtr(loadOp);
     auto defOpBlock = basePtr.getParentBlock();
     if (!defOpBlock->getParentOp()->isProperAncestor(targetFor)) {
       LDBG("Rejected: expect load base Ptr to be invariant to the loop");
@@ -195,7 +224,7 @@ struct AdvanceBasePointer : public OpRewritePattern<scf::ForOp> {
   }
 
   static bool isAddFirst(LoadData &ld) {
-    return ld.load.getOffsets().getDefiningOp() == ld.incrementOp;
+    return getOffset(ld.load).getDefiningOp() == ld.incrementOp;
   }
 
   static scf::ForOp
@@ -204,7 +233,7 @@ struct AdvanceBasePointer : public OpRewritePattern<scf::ForOp> {
     // Create new loop with additional arguments
     llvm::SmallVector<Value> newLoopArgs(forOp.getInitArgs());
     for (auto loadData : loads) {
-      newLoopArgs.push_back(loadData.load.getPtr());
+      newLoopArgs.push_back(getBasePtr(loadData.load));
     }
     rewriter.setInsertionPoint(forOp);
     auto newForOp = rewriter.create<scf::ForOp>(
@@ -255,32 +284,35 @@ struct AdvanceBasePointer : public OpRewritePattern<scf::ForOp> {
     // Replace base ptr with incrementing value
     for (auto [loadData, basePtr, nextBasePtr] :
          llvm::zip(loads, basePtrs, nextIterBasePtrs)) {
-      auto newLoad = cast<triton::amdgpu::BufferLoadOp>(
-          mapping.lookup<Operation *>(loadData.load));
-      constexpr int ptrIdx = 0;
-      constexpr int offsetIdx = 1;
-      newLoad.setOperand(offsetIdx, loadData.offsetInitializer);
+      auto newLoad = mapping.lookup<Operation *>(loadData.load);
+      setOffset(newLoad, loadData.offsetInitializer);
       // two cases:
       // 1. first advance pointer, then load
       // 2. load uses pointers from loop arguments, advanced pointer used on
       // next iteration
       Value advancingBasePtr = isAddFirst(loadData) ? nextBasePtr : basePtr;
-      newLoad.setOperand(ptrIdx, advancingBasePtr);
+      setBasePtr(newLoad, advancingBasePtr);
     }
     return newForOp;
   }
 
-  LogicalResult matchAndRewrite(scf::ForOp forOp,
-                                PatternRewriter &rewriter) const override {
-    LDBG("Analyzing ForOp for for offset pointer optimization: " << forOp);
-    // Gather buffer loads which could be optimized
-    SmallVector<LoadData> loads;
-    forOp.walk([&loads, forOp](triton::amdgpu::BufferLoadOp loadOp) {
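+  // Walk forOp and collect every load of type OpType that analyzeLoad
+  // accepts as optimizable.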
+  template <typename OpType>
+  static void collectLoads(SmallVector<LoadData> &loads, scf::ForOp forOp) {
+    forOp.walk([&loads, forOp](OpType loadOp) {
       auto loadData = analyzeLoad(loadOp, forOp);
       if (loadData.has_value()) {
         loads.push_back(loadData.value());
       }
     });
+  }
+
+  LogicalResult matchAndRewrite(scf::ForOp forOp,
+                                PatternRewriter &rewriter) const override {
+    LDBG("Analyzing ForOp for offset pointer optimization: " << forOp);
+    // Gather buffer loads which could be optimized
+    SmallVector<LoadData> loads;
+    collectLoads<triton::amdgpu::BufferLoadOp>(loads, forOp);
+    collectLoads<triton::amdgpu::BufferLoadToLocalOp>(loads, forOp);
 
     if (loads.empty())
       return rewriter.notifyMatchFailure(forOp, "no suitable buffer loads");