Commit f0dbc5d

[BACKEND] Fix regression of convert layout lowering. (#4447)
The Intel backend uses an optimal implementation for convert layout, which requires additional scratch space. The Intel-specific `AllocateSharedMemory` pass is needed to support it. This PR reverts the "[Intel] Remove `AllocateSharedMemory` pass (#3684)" commit f79d0fd and changes the Intel pipeline to reuse the Intel-specific `AllocateSharedMemory` pass. Signed-off-by: Lu,Chengjun <[email protected]>
1 parent 0ca3ce3 commit f0dbc5d
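
For context on the pipeline change, here is a minimal sketch of the restored ordering: the Intel-specific shared-memory allocation runs before the LLVM conversion. The factory function names below are hypothetical stand-ins, not the repository's actual API; the real registration lives in the Intel backend.

#include <memory>
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"

// Hypothetical pass factories standing in for the Intel backend's real ones;
// declared here only so the sketch is self-contained.
std::unique_ptr<mlir::Pass> createIntelAllocateSharedMemoryPass();
std::unique_ptr<mlir::Pass> createConvertTritonIntelGPUToLLVMPass();

// Sketch of the ordering this commit restores: the Intel-specific
// AllocateSharedMemory pass reserves the extra scratch space that the
// optimized convert-layout lowering needs, and it must run before the LLVM
// conversion, mirroring `--intel-allocate-shared-memory
// --convert-triton-intel-gpu-to-llvm` in the updated RUN lines.
void buildIntelLoweringPipeline(mlir::PassManager &pm) {
  pm.addPass(createIntelAllocateSharedMemoryPass());
  pm.addPass(createConvertTritonIntelGPUToLLVMPass());
}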

17 files changed: +170 −13 lines

test/Conversion/intel/dot_layout_offset.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s
 
 #dpas = #ttig.dpas<{repeatCount=8, systolicDepth=8, executionSize = 8, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA=[1, 1], repCluster=[2, 2]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>

test/Conversion/intel/dpas_to_block_layout_convert.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm --cse -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm --cse -canonicalize | FileCheck %s
 
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 16], warpsPerCTA = [16, 2], order = [1, 0]}>
Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory | FileCheck %s
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
+
+// Check no scratch memory is allocated for sub-group shuffle-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: ttg.shared = 0 : i32
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
+  // CHECK: tt.func @test_sub_group_shuffle
+  // CHECK-NOT: llvm.ptr<3>
+  tt.func @test_sub_group_shuffle(%arg0: tensor<16xf16, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf16, #ttg.slice<{dim = 1, parent = #blocked1}>> {
+    %0 = ttg.convert_layout %arg0 : tensor<16xf16, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf16, #ttg.slice<{dim = 1, parent = #blocked1}>>
+    tt.return %0 : tensor<16xf16, #ttg.slice<{dim = 1, parent = #blocked1}>>
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [2, 1], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [32, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
+
+// Check no scratch memory is allocated for sub-group shuffle-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: ttg.shared = 0 : i32
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
+  // CHECK: tt.func @test_sub_group_shuffle
+  // CHECK-NOT: llvm.ptr<3>
+  tt.func @test_sub_group_shuffle(%arg0: tensor<32xf16, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<32xf16, #ttg.slice<{dim = 1, parent = #blocked1}>> {
+    %0 = ttg.convert_layout %arg0 : tensor<32xf16, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32xf16, #ttg.slice<{dim = 1, parent = #blocked1}>>
+    tt.return %0 : tensor<32xf16, #ttg.slice<{dim = 1, parent = #blocked1}>>
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+
+// Check scratch memory configuration for different sub-group transpose-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: ttg.shared = 544 : i32
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
+  tt.func @test_f16(%arg0: tensor<16x16xf16, #blocked>) -> tensor<16x16xf16, #blocked1> {
+    %0 = ttg.convert_layout %arg0 : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #blocked1>
+    tt.return %0 : tensor<16x16xf16, #blocked1>
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+
+// Check scratch memory configuration for different sub-group transpose-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: ttg.shared = 1088 : i32
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
+  tt.func @test_f32(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16x16xf32, #blocked1> {
+    %0 = ttg.convert_layout %arg0 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #blocked1>
+    tt.return %0 : tensor<16x16xf32, #blocked1>
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 2], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
+
+// Check scratch memory configuration for different sub-group transpose-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: ttg.shared = 34816 : i32
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} {
+  tt.func @test_f32(%arg0: tensor<128x64xf32, #blocked>) -> tensor<128x64xf32, #blocked1> {
+    %0 = ttg.convert_layout %arg0 : tensor<128x64xf32, #blocked> -> tensor<128x64xf32, #blocked1>
+    tt.return %0 : tensor<128x64xf32, #blocked1>
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [16, 1], warpsPerCTA = [2, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [16, 2], threadsPerWarp = [1, 16], warpsPerCTA = [2, 4], order = [0, 1]}>
+
+// Check scratch memory configuration for different sub-group transpose-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: ttg.shared = 17408 : i32
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} {
+  tt.func @test_contiguous(%arg0: tensor<32x128xf32, #blocked>) -> tensor<32x128xf32, #blocked1> {
+    %0 = ttg.convert_layout %arg0 : tensor<32x128xf32, #blocked> -> tensor<32x128xf32, #blocked1>
+    tt.return %0 : tensor<32x128xf32, #blocked1>
+  }
+}
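
The `ttg.shared` values checked above are consistent with one plausible model of the transpose path: each warp stages its 16x16 sub-tiles through shared memory with one padding element per 16-element row (17 elements per row). The helper below is an illustrative reading of those CHECK values, not the backend's actual allocation formula.

#include <cassert>
#include <cstdio>

// Illustrative model (assumption, not the backend's code): scratch bytes =
// warps * (16x16 tiles per warp) * 16 rows * 17 padded columns * element size.
static int scratchBytes(int rows, int cols, int warpsRow, int warpsCol,
                        int elemBytes) {
  int numWarps = warpsRow * warpsCol;
  int tilesPerWarp = (rows / warpsRow / 16) * (cols / warpsCol / 16);
  return numWarps * tilesPerWarp * 16 * 17 * elemBytes;
}

int main() {
  assert(scratchBytes(16, 16, 1, 1, 2) == 544);    // test_f16
  assert(scratchBytes(16, 16, 1, 1, 4) == 1088);   // test_f32
  assert(scratchBytes(128, 64, 4, 2, 4) == 34816); // test_f32, 8 warps
  assert(scratchBytes(32, 128, 2, 4, 4) == 17408); // test_contiguous
  std::puts("all checked ttg.shared values reproduced under this assumption");
  return 0;
}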

test/Conversion/intel/shared_to_dot_layout_convert.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm -canonicalize | FileCheck %s
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm -canonicalize | FileCheck %s
 
 #blocked0 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 8], warpsPerCTA = [32, 1], order = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>

test/Conversion/intel/sub-group-shuffle.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s
 
 // Basic 16x16 shuffle test

test/Conversion/intel/sub-group-transpose.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm | FileCheck %s
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm | FileCheck %s
 
 // Basic 16x16 transpose test

test/Conversion/intel/tritongpu_to_gen.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // CHECK: llvm.func spir_kernelcc @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<1>)

test/Conversion/intel/tritongpu_to_gen_dot.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm --cse -canonicalize | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,NO-AGGRESSIVE-REUSE
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm --cse -canonicalize | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,NO-AGGRESSIVE-REUSE
 // RUN: env TRITON_INTEL_AGGRESSIVE_DPAS_REUSE=1 triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm --cse -canonicalize | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,AGGRESSIVE-REUSE
 
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1]}>

test/TritonIntelGPU/blockptr_load.mlir

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,LARGE-BLOCK-SIZE-TRANS-B
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm=one_matrix_per_load_for_bt=1 | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,SMALL-BLOCK-SIZE-TRANS-B
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,LARGE-BLOCK-SIZE-TRANS-B
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm=one_matrix_per_load_for_bt=1 | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,SMALL-BLOCK-SIZE-TRANS-B
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 4], order = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 2], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>

test/TritonIntelGPU/tensor-pointer-load-block-2d.mlir

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm --cse | FileCheck %s --implicit-check-not=llvm.inline_asm
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm --cse | FileCheck %s --implicit-check-not=llvm.inline_asm
 
 // CHECK: llvm.func spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv
 #mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
