
Commit d0abc51

[AMD] Introduce specialized Allocation pass (#7328)
This PR introduces an AMD-specific allocation pass and a new attribute that selects the conversion method: padded or swizzled. For now, the OptimizeLDSUsage pass puts all convert-layout operations into padded mode. --------- Co-authored-by: Alexander Efimov <[email protected]>
1 parent 355dc47 commit d0abc51
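
As a rough illustration of what the commit message describes, a pass such as OptimizeLDSUsage could mark every ttg.convert_layout op with the new attribute. The snippet below is a minimal sketch and not the actual pass body from this PR; the attribute string matches AttrSharedMemPadded introduced by this commit, and the test added below attaches it as a unit attribute, so presence is what matters.

#include "mlir/IR/BuiltinOps.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

// Minimal sketch: walk a module and put every ttg.convert_layout into padded
// mode by attaching the attribute introduced in this commit.
static void markAllConvertsPadded(mlir::ModuleOp mod) {
  mlir::MLIRContext *ctx = mod.getContext();
  mod.walk([&](mlir::triton::gpu::ConvertLayoutOp cvt) {
    // Unit attribute: the allocation analysis only needs to see that it is set.
    cvt->setAttr("amdgpu.use_padded_scratch_shmem", mlir::UnitAttr::get(ctx));
  });
}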

File tree

19 files changed, +247 -60 lines changed


bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions

@@ -69,6 +69,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerLLVMDIScope();
 
   // TritonAMDGPUToLLVM passes
+  mlir::triton::registerAllocateAMDGPUSharedMemory();
   mlir::triton::registerConvertTritonAMDGPUToLLVM();
   mlir::triton::registerConvertBuiltinFuncToLLVM();
   mlir::triton::registerOptimizeAMDLDSUsage();

include/triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+#ifndef TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ALLOCATE_UTILITY_H_
+#define TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ALLOCATE_UTILITY_H_
+
+#include "mlir/IR/BuiltinOps.h"
+#include "triton/Analysis/Allocation.h"
+
+namespace mlir::triton::gpu {
+
+/// Attach shared memory related attributes to module and operations inside it.
+/// This includes total shared memory consumption in module and shared memory
+/// offsets of buffers associated with operations.
+void attachAllocationSizeAndOffsetAttr(ModuleOp mod,
+                                       ModuleAllocation &allocation);
+
+} // namespace mlir::triton::gpu
+
+#endif // TRITON_CONVERSION_TRITON_GPU_TO_LLVM_ALLOCATE_UTILITY_H_

lib/Conversion/TritonGPUToLLVM/AllocateSharedMemory.cpp

Lines changed: 2 additions & 24 deletions

@@ -1,5 +1,6 @@
 #include "triton/Analysis/Allocation.h"
 #include "triton/Analysis/Utility.h"
+#include "triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h"
 #include "triton/Conversion/TritonGPUToLLVM/Passes.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -18,32 +19,9 @@ struct AllocateSharedMemory
                            AllocateSharedMemory> {
   void runOnOperation() override {
     ModuleOp mod = getOperation();
-    MLIRContext *ctx = &getContext();
     ModuleAllocation allocation(mod);
 
-    mod.walk<mlir::WalkOrder::PreOrder>([&](FunctionOpInterface funcOp) {
-      auto *funcAllocation = allocation.getFuncData(funcOp);
-      funcOp.walk([&](Operation *op) {
-        auto oBufferId = funcAllocation->getBufferId(op);
-        int offset = -1;
-        if (oBufferId != Allocation::InvalidBufferId)
-          offset = funcAllocation->getOffset(oBufferId);
-        else if (op->getNumResults() == 1) {
-          Value value = op->getResult(0);
-          auto vBufferId = funcAllocation->getBufferId(value);
-          if (vBufferId != Allocation::InvalidBufferId)
-            offset = funcAllocation->getOffset(vBufferId);
-        }
-        if (offset == -1)
-          return;
-        op->setAttr("allocation.offset",
-                    IntegerAttr::get(IntegerType::get(ctx, 32), offset));
-      });
-      return WalkResult::skip();
-    });
-    mod->setAttr("ttg.shared",
-                 mlir::IntegerAttr::get(mlir::IntegerType::get(ctx, 32),
-                                        allocation.getSharedMemorySize()));
+    mlir::triton::gpu::attachAllocationSizeAndOffsetAttr(mod, allocation);
   }
 };
 } // namespace
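
The point of this extraction is reuse: a backend-specific allocation pass can build its own ModuleAllocation and then call the same utility. Below is a rough, hedged sketch of how the new AMD pass could do that; it is not the AllocateAMDGPUSharedMemory pass from this PR, and passing a custom scratch-size callback to ModuleAllocation is an assumption inferred from the AMDAllocationAnalysisScratchSizeFn declaration added later in this commit.

#include "triton/Analysis/Allocation.h"
#include "triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h"
// ... plus the new AMD allocation header (its path is not shown in this view).

using namespace mlir;

// Hedged sketch only: size scratch buffers with the AMD-specific estimate,
// then reuse the shared utility to write ttg.shared and allocation.offset.
static void allocateAMDSharedMemory(ModuleOp mod) {
  ModuleAllocation allocation(mod,
                              triton::AMD::AMDAllocationAnalysisScratchSizeFn);
  triton::gpu::attachAllocationSizeAndOffsetAttr(mod, allocation);
}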

lib/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.cpp

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+#include "triton/Conversion/TritonGPUToLLVM/AllocateSharedMemoryUtility.h"
+
+namespace mlir::triton::gpu {
+
+void attachAllocationSizeAndOffsetAttr(ModuleOp mod,
+                                       ModuleAllocation &allocation) {
+  MLIRContext *ctx = mod.getContext();
+
+  mod.walk<mlir::WalkOrder::PreOrder>([&](FunctionOpInterface funcOp) {
+    auto *funcAllocation = allocation.getFuncData(funcOp);
+    funcOp.walk([&](Operation *op) {
+      auto oBufferId = funcAllocation->getBufferId(op);
+      int offset = -1;
+      if (oBufferId != Allocation::InvalidBufferId)
+        offset = funcAllocation->getOffset(oBufferId);
+      else if (op->getNumResults() == 1) {
+        Value value = op->getResult(0);
+        auto vBufferId = funcAllocation->getBufferId(value);
+        if (vBufferId != Allocation::InvalidBufferId)
+          offset = funcAllocation->getOffset(vBufferId);
+      }
+      if (offset == -1)
+        return;
+      op->setAttr("allocation.offset",
+                  IntegerAttr::get(IntegerType::get(ctx, 32), offset));
+    });
+    return WalkResult::skip();
+  });
+  mod->setAttr("ttg.shared",
+               mlir::IntegerAttr::get(mlir::IntegerType::get(ctx, 32),
+                                      allocation.getSharedMemorySize()));
+}
+
+} // namespace mlir::triton::gpu
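
The attributes written here are plain integer attributes, so any later lowering stage can read them back with standard MLIR accessors. The snippet below is an illustrative sketch, not code from this PR, of how a consumer might query the per-op offset and the module-level total:

#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Operation.h"

// Hedged sketch: read back the attributes attached by
// attachAllocationSizeAndOffsetAttr.
static int64_t sharedMemOffset(mlir::Operation *op) {
  if (auto attr = op->getAttrOfType<mlir::IntegerAttr>("allocation.offset"))
    return attr.getInt();
  return -1; // the op owns no shared-memory buffer
}

static int64_t totalSharedMem(mlir::ModuleOp mod) {
  if (auto attr = mod->getAttrOfType<mlir::IntegerAttr>("ttg.shared"))
    return attr.getInt();
  return 0; // the pass has not run, or the module uses no shared memory
}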

lib/Conversion/TritonGPUToLLVM/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@ add_triton_library(TritonGPUToLLVM
   DotOpToLLVM/FMA.cpp
   DotOpToLLVM/FMADotUtility.cpp
   AllocateSharedMemory.cpp
+  AllocateSharedMemoryUtility.cpp
   AllocateWarpGroups.cpp
   AssertOpToLLVM.cpp
   ControlFlowOpToLLVM.cpp

lib/Tools/GenericSwizzling.cpp

Lines changed: 1 addition & 0 deletions

@@ -382,4 +382,5 @@ LinearLayout optimalSwizzling(const LinearLayout &src, const LinearLayout &dst,
 
   return basis1D.reshapeOuts(src.getOutDims());
 }
+
 } // namespace mlir::triton::gpu

Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@
+// RUN: triton-opt %s -split-input-file --allocate-amdgpu-shared-memory | FileCheck %s
+
+#blocked1 = #ttg.blocked<{sizePerThread = [8, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+
+// This test checks the padding-based converter.
+//
+// The converter allocates a temporary buffer, then stores and reads parts of the tensor in a few transactions, which are named repeats.
+// The size of the temporary buffer is computed using the following algorithm:
+// - get the CTA tile shape of the blocked1 layout: [8*8*4, 4*8*1] = [256, 32]
+// - get the CTA tile shape of the blocked2 layout: [1*8*4, 1*8*1] = [32, 8]
+// - compute the common tile shape: [max(256, 32), max(32, 8)] = [256, 32]
+// - pad the fastest dimension (same as the output layout, dimension 1 in this case) by the memory access size to reduce bank conflicts: 16 bytes in this case.
+//
+// Therefore the total memory consumption of the scratch buffer is 256*(32 * 4 (size of one element) + 16 (padding)) = 36864 bytes.
+//
+// For the implementation see the mlir::triton::getNumScratchElemsPaddedCvt function.
+
+// CHECK: ttg.shared = 36864 : i32
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+
+  // CHECK-LABEL: @convert_layout_padded
+  tt.func @convert_layout_padded(%arg0: tensor<256x256xi32, #blocked1>) {
+    // CHECK-NEXT: allocation.offset = 0 : i32
+    %0 = ttg.convert_layout %arg0 {amdgpu.use_padded_scratch_shmem} : tensor<256x256xi32, #blocked1> -> tensor<256x256xi32, #blocked2>
+    tt.return
+  }
+
+}
+
+// -----
+
+#blocked1 = #ttg.blocked<{sizePerThread = [8, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+
+// This test checks the swizzling-based converter.
+//
+// The swizzling converter tries to find a swizzling pattern that provides the widest load and store instructions and avoids as many bank conflicts as possible.
+// The current converter implementation decides that the best swizzling pattern requires allocating a tile of shape [256, 128], which takes 256*128*4 (size of one element) = 131072 bytes.
+//
+// For the implementation see the mlir::triton::getNumScratchElemsSwizzledCvt function,
+// in particular mlir::triton::gpu::optimalSwizzling, which determines the shape of the repeat tile.
+
+// CHECK: ttg.shared = 131072 : i32
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+
+  // CHECK-LABEL: @convert_layout_swizzled
+  tt.func @convert_layout_swizzled(%arg0: tensor<256x256xi32, #blocked1>) {
+    // CHECK-NEXT: allocation.offset = 0 : i32
+    %0 = ttg.convert_layout %arg0 : tensor<256x256xi32, #blocked1> -> tensor<256x256xi32, #blocked2>
+    tt.return
+  }
+
+}
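
To make the padded-size arithmetic in the first test easy to check, here is a small self-contained sketch that reproduces the computation described in the comment above. It is not the real getNumScratchElemsPaddedCvt; the tile shapes, element size, and padding value are hard-coded from that example.

#include <algorithm>
#include <cstdio>

// Hedged sketch: reproduces the scratch-size arithmetic of the padded test.
int main() {
  // CTA tile shapes of the two blocked layouts.
  const int srcTile[2] = {8 * 8 * 4, 4 * 8 * 1}; // [256, 32]
  const int dstTile[2] = {1 * 8 * 4, 1 * 8 * 1}; // [32, 8]
  // Common tile: element-wise max of the two tiles.
  const int tile0 = std::max(srcTile[0], dstTile[0]); // 256
  const int tile1 = std::max(srcTile[1], dstTile[1]); // 32
  const int elemBytes = 4; // i32 elements
  const int padBytes = 16; // padding of the fastest dimension per row
  const int totalBytes = tile0 * (tile1 * elemBytes + padBytes);
  std::printf("scratch bytes = %d\n", totalBytes); // prints 36864
  return 0;
}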

test/TritonGPU/amd/optimize-lds-usage.mlir

Lines changed: 11 additions & 11 deletions

@@ -5,8 +5,8 @@
 // CHECK-LABEL: alloc_convert_load
 // CHECK-32KLIMIT-LABEL: alloc_convert_load
 // CHECK: %0 = ttg.local_alloc %arg0 : {{.*}}#blocked{{.*}}#shared
-// CHECK: %1 = ttg.convert_layout %arg1 : {{.*}}#blocked{{.*}}#blocked1
-// CHECK: %2 = ttg.convert_layout %1 : {{.*}}#blocked1{{.*}}#mma
+// CHECK: %1 = ttg.convert_layout %arg1 {{.*}}: {{.*}}#blocked{{.*}}#blocked1
+// CHECK: %2 = ttg.convert_layout %1 {{.*}}: {{.*}}#blocked1{{.*}}#mma
 // CHECK: %3 = ttg.local_load %0 : {{.*}}#shared{{.*}}#ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
 #mma = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>
@@ -28,8 +28,8 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
 // CHECK-LABEL: alloc_convert_small_load
 // CHECK-32KLIMIT-LABEL: alloc_convert_small_load
 // CHECK: %0 = ttg.local_alloc %arg0 : {{.*}}#blocked{{.*}}#shared
-// CHECK: %1 = ttg.convert_layout %arg1 : {{.*}}#blocked{{.*}}#blocked1
-// CHECK: %2 = ttg.convert_layout %1 : {{.*}}#blocked1{{.*}}#mma
+// CHECK: %1 = ttg.convert_layout %arg1 {{.*}}: {{.*}}#blocked{{.*}}#blocked1
+// CHECK: %2 = ttg.convert_layout %1 {{.*}}: {{.*}}#blocked1{{.*}}#mma
 // CHECK: %3 = ttg.local_load %0 : {{.*}}#shared{{.*}}#ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
 #mma = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>
@@ -55,7 +55,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
 // CHECK-32KLIMIT-LABEL: alloc_convert_3d_load
 // CHECK: [[V0:%.*]] = ttg.local_alloc {{.*}}[[$BLOCKED1]]{{.*}}
 // CHECK: [[V1:%.*]] = ttg.convert_layout {{.*}}[[$BLOCKED1]]{{.*}}[[$BLOCKED2]]
-// CHECK: [[V2:%.*]] = ttg.convert_layout [[V1]] : {{.*}}[[$BLOCKED2]]{{.*}}[[$MMA]]
+// CHECK: [[V2:%.*]] = ttg.convert_layout [[V1]] {{.*}}: {{.*}}[[$BLOCKED2]]{{.*}}[[$MMA]]
 // CHECK: [[V3:%.*]] = ttg.local_load [[V0]] : {{.*}}#ttg.dot_op<{opIdx = 0, parent = [[$MMA]], kWidth = 4}>>
 #blocked = #ttg.blocked<{sizePerThread = [1, 8, 1], threadsPerWarp = [1, 16, 4], warpsPerCTA = [1, 1, 8], order = [0, 1, 2]}>
 #mma = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 1, 8], instrShape = [32, 32], isTransposed = false}>
@@ -75,12 +75,12 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
 // Check that the optimization triggers with a custom LDS limit and does not trigger with the default one
 // CHECK-LABEL: alloc_convert_32k_limit
 // CHECK: %0 = ttg.local_alloc %arg0 : {{.*}}#blocked{{.*}}#shared
-// CHECK: %1 = ttg.convert_layout %arg1 : {{.*}}#blocked{{.*}}#mma
+// CHECK: %1 = ttg.convert_layout %arg1 {{.*}}: {{.*}}#blocked{{.*}}#mma
 // CHECK: %2 = ttg.local_load %0 : {{.*}}#shared{{.*}}#ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
 // CHECK-32KLIMIT-LABEL: alloc_convert_32k_limit
 // CHECK-32KLIMIT: %0 = ttg.local_alloc %arg0 : {{.*}}#blocked{{.*}}#shared
-// CHECK-32KLIMIT: %1 = ttg.convert_layout %arg1 : {{.*}}#blocked{{.*}}#blocked1
-// CHECK-32KLIMIT: %2 = ttg.convert_layout %1 : {{.*}}#blocked1{{.*}}#mma
+// CHECK-32KLIMIT: %1 = ttg.convert_layout %arg1 {{.*}}: {{.*}}#blocked{{.*}}#blocked1
+// CHECK-32KLIMIT: %2 = ttg.convert_layout %1 {{.*}}: {{.*}}#blocked1{{.*}}#mma
 // CHECK-32KLIMIT: %3 = ttg.local_load %0 : {{.*}}#shared{{.*}}#ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
 #blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
 #mma = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>
@@ -106,9 +106,9 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32}
 
 // CHECK: tt.func public @mfma_dot_shortcut([[ARG_0:%[a-z0-9]*]]: {{.*}}, [[ARG_1:%[a-z0-9]*]]: {{.*}}, [[ARG_2:%[a-z0-9]*]]: {{.*}})
 // CHECK: [[ALLOC:%[0-9]+]] = ttg.local_alloc [[ARG_0]] : (tensor<128x128xf16, [[BLOCKED_1]]>) -> !ttg.memdesc<128x128xf16, [[SHARED]], #smem>
-// CHECK: [[INTERMEDIATE_CONV:%[0-9]+]] = ttg.convert_layout [[ARG_1]] : tensor<128x128xf32, [[BLOCKED_1]]> -> tensor<128x128xf32, [[BLOCKED_2]]>
-// CHECK: [[CONVERT_1:%[0-9]+]] = ttg.convert_layout [[INTERMEDIATE_CONV]] : tensor<128x128xf32, [[BLOCKED_2]]> -> tensor<128x128xf32, [[MMA_2]]>
-// CHECK: [[CONVERT_2:%[0-9]+]] = ttg.convert_layout [[ARG_2]] : tensor<256x128xf16, [[MMA_1]]> -> tensor<256x128xf16, #ttg.dot_op<{opIdx = 0, parent = [[MMA_1]], kWidth = 4}>>
+// CHECK: [[INTERMEDIATE_CONV:%[0-9]+]] = ttg.convert_layout [[ARG_1]] {{.*}}: tensor<128x128xf32, [[BLOCKED_1]]> -> tensor<128x128xf32, [[BLOCKED_2]]>
+// CHECK: [[CONVERT_1:%[0-9]+]] = ttg.convert_layout [[INTERMEDIATE_CONV]] {{.*}}: tensor<128x128xf32, [[BLOCKED_2]]> -> tensor<128x128xf32, [[MMA_2]]>
+// CHECK: [[CONVERT_2:%[0-9]+]] = ttg.convert_layout [[ARG_2]] {{.*}}: tensor<256x128xf16, [[MMA_1]]> -> tensor<256x128xf16, #ttg.dot_op<{opIdx = 0, parent = [[MMA_1]], kWidth = 4}>>
 // CHECK: [[LOAD:%[0-9]+]] = ttg.local_load [[ALLOC]] : !ttg.memdesc<128x128xf16, [[SHARED]], #smem> -> tensor<128x128xf16, #ttg.dot_op<{opIdx = 0, parent = [[MMA_2]], kWidth = 4}>>
 #blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
 #mma1 = #ttg.amd_mfma<{version = 2, warpsPerCTA = [1, 8], instrShape = [32, 32], isTransposed = false}>

third_party/amd/backend/compiler.py

Lines changed: 1 addition & 1 deletion

@@ -303,7 +303,7 @@ def make_llir(src, metadata, options):
     passes.convert.add_scf_to_cf(pm)
     passes.convert.add_index_to_llvmir(pm)
 
-    passes.ttgpuir.add_allocate_shared_memory(pm)
+    amd.passes.ttgpuir.add_allocate_shared_memory(pm)
     ## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows:
     ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
     ##    of the value of kernel arg `allow_flush_denorm`.

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+#ifndef TRITONAMD_ANALYSIS_AMDGPU_ALLOCATION_H
+#define TRITONAMD_ANALYSIS_AMDGPU_ALLOCATION_H
+
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
+
+namespace mlir::triton::AMD {
+
+constexpr char AttrSharedMemPadded[] = "amdgpu.use_padded_scratch_shmem";
+
+unsigned getConvertLayoutScratchInBytes(RankedTensorType srcTy,
+                                        RankedTensorType dstTy,
+                                        bool usePadding);
+
+unsigned AMDAllocationAnalysisScratchSizeFn(Operation *op);
+
+} // namespace mlir::triton::AMD
+
+#endif // TRITONAMD_ANALYSIS_AMDGPU_ALLOCATION_H
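
This header only declares the AMD hooks; their definitions are not shown in this excerpt. Below is a plausible sketch of how a scratch-size callback could combine the two declarations, dispatching on the padded-scratch attribute and deferring to getConvertLayoutScratchInBytes. The op casts and accessors are standard Triton/MLIR APIs, but the body is an assumption, not the code from this PR.

#include "triton/Dialect/TritonGPU/IR/Dialect.h"
// ... plus the AMD allocation header above (its path is not shown in this view).

using namespace mlir;

// Hedged sketch only: a possible shape for an AMD scratch-size callback.
// The real AMDAllocationAnalysisScratchSizeFn in this PR may differ.
static unsigned exampleConvertLayoutScratchSize(Operation *op) {
  if (auto cvt = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
    auto srcTy = cast<RankedTensorType>(cvt.getSrc().getType());
    auto dstTy = cast<RankedTensorType>(cvt.getType());
    // Padded mode is requested by attaching the unit attribute
    // "amdgpu.use_padded_scratch_shmem" (AttrSharedMemPadded) to the op.
    bool usePadding = op->hasAttr(triton::AMD::AttrSharedMemPadded);
    return triton::AMD::getConvertLayoutScratchInBytes(srcTy, dstTy, usePadding);
  }
  return 0; // non-convert ops: no AMD-specific scratch in this sketch
}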
