Commit 4fb6cc2

[AMD] Added local_alloc refinement
1 parent 2abf233 commit 4fb6cc2

File tree

2 files changed: +125 -0 lines changed

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
// RUN: triton-opt %s -split-input-file -triton-amdgpu-refine-ops='arch=gfx942' -canonicalize | FileCheck %s

#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
#smem = #ttg.shared_memory

// CHECK-LABEL: @local_alloc_refinement
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 16384 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @local_alloc_refinement(%arg0: tensor<64x16xf16, #blocked>) attributes {noinline = false} {
    // CHECK: [[OFFSET_12:%.*]] = arith.constant 12 : i32
    // CHECK: [[OFFSET_8:%.*]] = arith.constant 8 : i32
    // CHECK: [[OFFSET_4:%.*]] = arith.constant 4 : i32
    // CHECK: [[OFFSET_0:%.*]] = arith.constant 0 : i32
    // CHECK: [[ALLOC:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable>
    // CHECK: [[SUBVIEW_0:%.*]] = ttg.memdesc_subview [[ALLOC]][[[OFFSET_0]], [[OFFSET_0]], [[OFFSET_0]]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
    // CHECK: [[SLICE_0:%.*]] = amdgpu.extract_slice %arg0 [0, 0] : tensor<64x16xf16, #blocked> to tensor<64x4xf16, #blocked>
    // CHECK: ttg.local_store [[SLICE_0]], [[SUBVIEW_0]] : tensor<64x4xf16, #blocked> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
    // CHECK: [[SUBVIEW_1:%.*]] = ttg.memdesc_subview [[ALLOC]][[[OFFSET_0]], [[OFFSET_0]], [[OFFSET_4]]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
    // CHECK: [[SLICE_1:%.*]] = amdgpu.extract_slice %arg0 [0, 4] : tensor<64x16xf16, #blocked> to tensor<64x4xf16, #blocked>
    // CHECK: ttg.local_store [[SLICE_1]], [[SUBVIEW_1]] : tensor<64x4xf16, #blocked> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
    // CHECK: [[SUBVIEW_2:%.*]] = ttg.memdesc_subview [[ALLOC]][[[OFFSET_0]], [[OFFSET_0]], [[OFFSET_8]]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
    // CHECK: [[SLICE_2:%.*]] = amdgpu.extract_slice %arg0 [0, 8] : tensor<64x16xf16, #blocked> to tensor<64x4xf16, #blocked>
    // CHECK: ttg.local_store [[SLICE_2]], [[SUBVIEW_2]] : tensor<64x4xf16, #blocked> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
    // CHECK: [[SUBVIEW_3:%.*]] = ttg.memdesc_subview [[ALLOC]][[[OFFSET_0]], [[OFFSET_0]], [[OFFSET_12]]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
    // CHECK: [[SLICE_3:%.*]] = amdgpu.extract_slice %arg0 [0, 12] : tensor<64x16xf16, #blocked> to tensor<64x4xf16, #blocked>
    // CHECK: ttg.local_store [[SLICE_3]], [[SUBVIEW_3]] : tensor<64x4xf16, #blocked> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
    // CHECK: amdgpu.instruction_sched_hint {isBufferLoadsAEnabled = false, isBufferLoadsBEnabled = false, numDsReadsA = #amdgpu.InstCounter<0, none>, numDsReadsB = #amdgpu.InstCounter<0, none>, numDsWritesA = #amdgpu.InstCounter<0, none>, numDsWritesB = #amdgpu.InstCounter<0, none>, numGlobalLoadsA = #amdgpu.InstCounter<0, none>, numGlobalLoadsB = #amdgpu.InstCounter<0, none>, numMMAs = #amdgpu.InstCounter<0, none>, variant = #amdgpu.SchedHintVariant<refine_ops>}
    // CHECK: ttg.local_dealloc [[ALLOC]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable>
    %0 = ttg.local_alloc %arg0 : (tensor<64x16xf16, #blocked>) -> !ttg.memdesc<64x16xf16, #shared, #smem>
    amdgpu.instruction_sched_hint {isBufferLoadsAEnabled = false, isBufferLoadsBEnabled = false, numDsReadsA = #amdgpu.InstCounter<0, none>, numDsReadsB = #amdgpu.InstCounter<0, none>, numDsWritesA = #amdgpu.InstCounter<0, none>, numDsWritesB = #amdgpu.InstCounter<0, none>, numGlobalLoadsA = #amdgpu.InstCounter<0, none>, numGlobalLoadsB = #amdgpu.InstCounter<0, none>, numMMAs = #amdgpu.InstCounter<0, none>, variant = #amdgpu.SchedHintVariant<refine_ops>}
    tt.return
  }
}
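
Note: the rewrite the CHECK lines above verify can be summarized as the before/after sketch below. It assumes the 64x16 f16 input and layouts from this test; the SSA names and the single full-size store are illustrative, not literal pass output. The new LocalAllocOpPattern only performs the alloc rewrite shown here; the existing LocalStoreOp refinement pattern and canonicalization then split the store into the four 64x4 slices the test checks.

// Before: an immutable local_alloc initialized directly from registers.
%0 = ttg.local_alloc %arg0 : (tensor<64x16xf16, #blocked>) -> !ttg.memdesc<64x16xf16, #shared, #smem>

// After LocalAllocOpPattern (sketch): a mutable alloc with a leading unit dim,
// a full-shape subview that takes over the uses of %0, an explicit local_store
// of the operand, and a local_dealloc placed after the last user.
%c0 = arith.constant 0 : i32
%alloc = ttg.local_alloc : () -> !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable>
%view = ttg.memdesc_subview %alloc[%c0, %c0, %c0] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x16xf16, #shared, #smem, mutable, 1x64x16>
ttg.local_store %arg0, %view : tensor<64x16xf16, #blocked> -> !ttg.memdesc<64x16xf16, #shared, #smem, mutable, 1x64x16>
// ... uses of %view (formerly %0) ...
ttg.local_dealloc %alloc : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable>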

third_party/amd/lib/TritonAMDGPUTransforms/RefineOps.cpp

Lines changed: 90 additions & 0 deletions
@@ -692,6 +692,92 @@ struct LocalStoreOpPattern
  }
};

struct LocalAllocOpPattern
    : public RefineRewritePattern<triton::gpu::LocalAllocOp> {
  LocalAllocOpPattern(MLIRContext *context, PatternBenefit benefit = 1)
      : RefineRewritePattern(context, benefit) {}

  // Refines non-mutable `LocalAllocOp` ops. The non-mutable variant is used as
  // the non-pipelined version of the op. To be able to refine the op, we
  // replace the non-mutable variant with a mutable one, which requires a
  // `LocalDeallocOp` after the last user of the `LocalAllocOp` result.
  // A `LocalStoreOp` is used to move the data from registers to the LDS.
  // The refinement of the resulting `LocalStoreOp` is left to the dedicated
  // rewrite pattern.
  LogicalResult apply(triton::gpu::LocalAllocOp op,
                      PatternRewriter &rewriter) const override {
    auto ctx = op->getContext();
    auto loc = op.getLoc();
    auto alignment = op.getAlignment();

    if (op->getNumOperands() == 0)
      return failure();

    auto allocType = cast<triton::gpu::MemDescType>(op.getResult().getType());
    auto origShape = allocType.getShape();
    SmallVector<int64_t> newShape(origShape);
    SmallVector<int64_t> newAllocShape(allocType.getAllocShape());

    if (newShape.size() == 2) {
      newShape.insert(newShape.begin(), 1);
    }
    assert(newShape.size() == 3);

    if (newAllocShape.size() == 2) {
      newAllocShape.insert(newAllocShape.begin(), 1);
    }
    assert(newAllocShape.size() == 3);

    auto newAllocType = triton::gpu::MemDescType::get(
        ctx, newShape, allocType.getElementType(), allocType.getEncoding(),
        allocType.getMemorySpace(),
        /*mutableMemory=*/true, newAllocShape);

    rewriter.setInsertionPointAfter(op);
    auto newAlloc =
        rewriter.create<triton::gpu::LocalAllocOp>(loc, newAllocType);
    newAlloc->setAttrs(op->getAttrs());

    auto newSubviewType = triton::gpu::MemDescType::get(
        ctx, origShape, allocType.getElementType(), allocType.getEncoding(),
        allocType.getMemorySpace(),
        /*mutableMemory=*/true, newAllocShape);

    auto offset = createOffset({}, {0, 0, 0}, rewriter, loc);
    auto newSubview = rewriter.create<ttg::MemDescSubviewOp>(
        loc, newSubviewType, newAlloc, offset);
    rewriter.create<ttg::LocalStoreOp>(loc, op.getOperand(0), newSubview);

    mlir::Operation *lastUser = nullptr;
    for (auto *user : op.getResult().getUsers()) {
      if (!lastUser || user->isBeforeInBlock(lastUser) == false) {
        lastUser = user;
      }
    }

    Operation &lastOpInBlock = op->getBlock()->back();
    const bool noUsers = lastUser == nullptr;
    const bool isLastInstr = noUsers
                                 ? false
                                 : mlir::OperationEquivalence::isEquivalentTo(
                                       &lastOpInBlock, lastUser,
                                       mlir::OperationEquivalence::Flags::None);
    if (noUsers || isLastInstr) {
      rewriter.setInsertionPoint(&lastOpInBlock);
    } else {
      rewriter.setInsertionPointAfter(lastUser);
    }

    rewriter.create<triton::gpu::LocalDeallocOp>(loc, newAlloc.getResult());

    op.replaceAllUsesWith(newSubview.getResult());
    rewriter.eraseOp(op);

    return success();
  }
};

struct ReduceOpPattern : public RefineRewritePattern<triton::ReduceOp> {
  ReduceOpPattern(MLIRContext *context, PatternBenefit benefit = 1)
      : RefineRewritePattern(context, benefit) {}
@@ -1134,6 +1220,10 @@ struct TritonAMDGPURefineOps
      return signalPassFailure();
    }

    RewritePatternSet primaryPatterns(context);
    primaryPatterns.add<LocalAllocOpPattern>(context, /*benefit=*/1);
    walkAndApplyPatterns(func, std::move(primaryPatterns));

    RewritePatternSet patterns(context);
    patterns.add<LocalLoadOpPattern>(context, /*benefit=*/1);
    patterns.add<DotOpPattern>(context, /*benefit=*/1);