Commit 7375302
Add rewrite_stack_ptr post process pass (#3497)
This PR makes the following changes:

1. Drop the fork-only `TargetInfo::getStackPointer` (so the interface matches upstream) and replace it with an Intel-specific pass, per the discussion in #3046 (comment).
2. Use the public `ControlFlowOpToLLVM` and sync the public `FuncOpToLLVM` lowering into `PipelineManager` so the calling convention matches upstream (the public `FuncOpToLLVM` is difficult to use directly because it contains NVIDIA-specific code); see the sketch below.
3. Add lit tests.
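As a point of reference for item 2, here is a minimal sketch of the lowered calling convention (`@device_fn` and `@kernel` are illustrative names, not part of this PR; the argument shapes follow the CHECK lines of the new lit test further down): a non-inlined device function receives a trailing shared-memory base pointer (`!llvm.ptr<3>`) and a global scratch pointer (`!llvm.ptr<1>`), and a kernel caller that allocates no shared memory forwards a poison value for the shared-memory operand.

```mlir
// Sketch only: hypothetical names; the trailing arguments mirror the CHECK
// lines in the new lit test added by this commit.
module {
  // Device function: the last two parameters are the shared-memory base
  // (address space 3) and the global scratch pointer (address space 1),
  // appended to match the upstream calling convention.
  llvm.func internal @device_fn(%arg0: f32, %arg1: !llvm.ptr<1>,
                                %arg2: !llvm.ptr<3>, %arg3: !llvm.ptr<1>) {
    llvm.return
  }

  // Kernel caller: with no shared memory allocated in the module, the
  // shared-memory operand is materialized as poison (see the
  // tritonintelgpu-rewrite-stack-ptr pass below).
  llvm.func spir_kernelcc @kernel(%arg0: f32, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>) {
    %smem = llvm.mlir.poison : !llvm.ptr<3>
    llvm.call @device_fn(%arg0, %arg1, %smem, %arg2) : (f32, !llvm.ptr<1>, !llvm.ptr<3>, !llvm.ptr<1>) -> ()
    llvm.return
  }
}
```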
1 parent 5d68d95 commit 7375302

File tree: 21 files changed, +322 −252 lines

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 0 additions & 3 deletions
@@ -98,9 +98,6 @@ class TargetInfoBase {
   virtual void storeOpAnnotation(triton::gpu::LocalStoreOp op,
                                  size_t localStoreOpCount, Type type) const {}
 
-  virtual Value getStackPointer(RewriterBase &rewriter,
-                                FunctionOpInterface funcOp) const = 0;
-
   virtual ~TargetInfoBase() {}
 };
 } // namespace mlir::triton

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 1 addition & 1 deletion
@@ -583,7 +583,7 @@ inline Value getSharedMemoryBase(Location loc, RewriterBase &rewriter,
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   Value offVal = b.i32_val(offset);
   Value base =
-      b.gep(ptrTy, i8_ty, target.getStackPointer(rewriter, func), offVal);
+      b.gep(ptrTy, i8_ty, LLVM::getStackPointer(rewriter, func), offVal);
   return base;
 }
 
lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ struct CallOpConversion : public ConvertOpToLLVMPattern<triton::CallOp> {
         callOp.getLoc(), /*opOperands=*/callOp->getOperands(),
         adaptor.getOperands(), rewriter);
     if (!caller->hasAttr("allocation.offset")) {
-      auto base = targetInfo.getStackPointer(rewriter, caller);
+      auto base = LLVM::getStackPointer(rewriter, caller);
       promotedOperands.push_back(base);
     } else {
       auto base = LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, callOp);

test/Conversion/intel/sub-group-transpose.mlir

Lines changed: 38 additions & 19 deletions
Large diffs are not rendered by default.

test/Conversion/intel/tritongpu_to_gen.mlir

Lines changed: 7 additions & 2 deletions
@@ -603,6 +603,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: basic_alloc_tensor(%arg0: !llvm.ptr<3>)
   tt.func @basic_alloc_tensor() {
     // CHECK-NEXT: llvm.mlir.constant
+    // CHECK-NEXT: llvm.mlir.addressof @global_smem
     // CHECK-NEXT: llvm.getelementptr
     %0 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #shared0, #smem, mutable>
     tt.return
@@ -1102,7 +1103,9 @@ module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warp
     // CHECK-NEXT: llvm.br ^bb2([[CMPXCHG_RES]] : i32)
     // CHECK-NEXT: ^bb2([[RES:%.*]]: i32):
     // CHECK-NEXT: [[RES_CAST:%.*]] = llvm.bitcast [[RES]] : i32 to f32
-    // CHECK: [[GEP:%.*]] = llvm.getelementptr %arg3[{{.*}}] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, i8
+    // CHECK: [[C_0:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK: [[SMEM_0:%.*]] = llvm.mlir.addressof @global_smem : !llvm.ptr<3>
+    // CHECK: [[GEP:%.*]] = llvm.getelementptr [[SMEM_0]]{{\[}}[[C_0]]] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, i8
     // CHECK-NEXT: [[GEP_CAST:%.*]] = llvm.bitcast [[GEP]] : !llvm.ptr<3> to !llvm.ptr<3>
     // CHECK-NEXT: llvm.cond_br [[MASK]], ^bb3, ^bb4
     // CHECK-NEXT: ^bb3:
@@ -1210,7 +1213,9 @@ module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warp
     // CHECK-NEXT: llvm.br ^bb2([[RMW_RES]] : f32)
     // CHECK-NEXT: ^bb2([[RMW_PHI:%.*]]: f32):
    // CHECK-NEXT: [[RMW_CAST:%.*]] = llvm.bitcast [[RMW_PHI]] : f32 to f32
-    // CHECK: [[GEP:%.*]] = llvm.getelementptr %arg3[{{.*}}] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, i8
+    // CHECK: [[C_0:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK: [[SMEM_0:%.*]] = llvm.mlir.addressof @global_smem : !llvm.ptr<3>
+    // CHECK: [[GEP:%.*]] = llvm.getelementptr [[SMEM_0]]{{\[}}[[C_0]]] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, i8
     // CHECK-NEXT: [[GEP_CAST:%.*]] = llvm.bitcast [[GEP]] : !llvm.ptr<3> to !llvm.ptr<3>
     // CHECK-NEXT: llvm.cond_br [[PRED]], ^bb3, ^bb4
     // CHECK-NEXT: ^bb3:
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+// RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm --tritonintelgpu-rewrite-stack-ptr | FileCheck %s
+
+module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
+  // CHECK-LABEL: llvm.func spir_kernelcc @kernel(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>)
+  tt.func public @kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %0 = tt.load %arg0 : !tt.ptr<f32>
+    %1 = tt.load %arg1 : !tt.ptr<f32>
+    // CHECK: llvm.mlir.poison : !llvm.ptr<3>
+    // CHECK: llvm.call @noinline_simple_fn__fp32_fp32_Pfp32__(%8, %17, %arg2, %18, %arg2)
+    tt.call @noinline_simple_fn__fp32_fp32_Pfp32__(%0, %1, %arg2) : (f32, f32, !tt.ptr<f32>) -> ()
+    tt.return
+  }
+  // CHECK: llvm.func internal @noinline_simple_fn__fp32_fp32_Pfp32__(%arg0: f32, %arg1: f32, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>, %arg4: !llvm.ptr<1>)
+  tt.func private @noinline_simple_fn__fp32_fp32_Pfp32__(%arg0: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg1: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg2: !tt.ptr<f32> {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 16 : i64}) attributes {noinline = true} {
+    %0 = arith.addf %arg0, %arg1 fastmath<fast> : f32
+    tt.store %arg2, %0 : !tt.ptr<f32>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 4], warpsPerCTA = [1, 1], order = [1, 0]}>
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 1], A = [16, 8], B = [8, 16], C = [16, 16]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 1280 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} {
+  // CHECK-LABEL: llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
+  // CHECK-LABEL: llvm.func spir_kernelcc @kernel(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>)
+  tt.func public @kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %0 = tt.load %arg0 : !tt.ptr<f32>
+    %1 = tt.load %arg1 : !tt.ptr<f32>
+    // CHECK: llvm.call @noinline_shared_fn__fp32_fp32_Pfp32__(%8, %17, %arg2, %arg3, %arg2)
+    tt.call @noinline_shared_fn__fp32_fp32_Pfp32__(%0, %1, %arg2) {allocation.offset = 0 : i32} : (f32, f32, !tt.ptr<f32>) -> ()
+    tt.return
+  }
+  // CHECK: llvm.func internal @noinline_shared_fn__fp32_fp32_Pfp32__(%arg0: f32, %arg1: f32, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>, %arg4: !llvm.ptr<1>)
+  // CHECK: llvm.getelementptr %arg3[{{.*}}]
+  tt.func private @noinline_shared_fn__fp32_fp32_Pfp32__(%arg0: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg1: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg2: !tt.ptr<f32> {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 16 : i64}) attributes {noinline = true} {
+    %cst = arith.constant dense<16> : tensor<16x1xi32, #blocked>
+    %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked>
+    %2 = arith.muli %1, %cst : tensor<16x1xi32, #blocked>
+    %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %4 = tt.expand_dims %3 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked>
+    %5 = tt.broadcast %2 : tensor<16x1xi32, #blocked> -> tensor<16x16xi32, #blocked>
+    %6 = tt.broadcast %4 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked>
+    %7 = arith.addi %5, %6 : tensor<16x16xi32, #blocked>
+    %8 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<16x16x!tt.ptr<f32>, #blocked>
+    %9 = tt.addptr %8, %7 : tensor<16x16x!tt.ptr<f32>, #blocked>, tensor<16x16xi32, #blocked>
+    %10 = tt.load %9 : tensor<16x16x!tt.ptr<f32>, #blocked>
+    %11 = ttg.local_alloc %10 {allocation.offset = 0 : i32} : (tensor<16x16xf32, #blocked>) -> !ttg.memdesc<16x16xf32, #shared, #smem>
+    %12 = tt.splat %arg0 : f32 -> tensor<16x16xf32, #mma>
+    %13 = ttg.local_load %11 : !ttg.memdesc<16x16xf32, #shared, #smem> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
+    %14 = ttg.local_load %11 : !ttg.memdesc<16x16xf32, #shared, #smem> -> tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
+    %15 = tt.dot %13, %14, %12, inputPrecision = tf32 : tensor<16x16xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<16x16xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, #mma>
+    %16 = tt.splat %arg1 : f32 -> tensor<16x16xf32, #mma>
+    %17 = arith.addf %15, %16 fastmath<fast> : tensor<16x16xf32, #mma>
+    %18 = ttg.convert_layout %17 {allocation.offset = 0 : i32} : tensor<16x16xf32, #mma> -> tensor<16x16xf32, #blocked>
+    tt.store %9, %18 : tensor<16x16x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}

third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp

Lines changed: 0 additions & 13 deletions
@@ -447,19 +447,6 @@ void TargetInfo::assertFail(RewriterBase &rewriter, Location loc,
 
 int TargetInfo::getSharedAddressSpace() const { return 3; }
 
-Value TargetInfo::getStackPointer(RewriterBase &rewriter,
-                                  FunctionOpInterface funcOp) const {
-  // See NOTE: [Additional Function Arguments]
-  if (!LLVM::isKernel(funcOp)) {
-    return funcOp.getArgument(funcOp.getNumArguments() - 2);
-  }
-
-  auto mod = funcOp->getParentOfType<ModuleOp>();
-  auto globalBase = dyn_cast<LLVM::GlobalOp>(mod.lookupSymbol("global_smem"));
-  assert(globalBase);
-  return rewriter.create<LLVM::AddressOfOp>(funcOp.getLoc(), globalBase);
-}
-
 int TargetInfo::getAddressSpace(Attribute addressSpace) const {
   int spaceId = 0;
   if (isa<triton::gpu::SharedMemorySpaceAttr>(addressSpace)) {

third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h

Lines changed: 0 additions & 3 deletions
@@ -75,9 +75,6 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
   void storeOpAnnotation(triton::gpu::LocalStoreOp op, size_t localStoreOpCount,
                          Type type) const override;
 
-  Value getStackPointer(RewriterBase &rewriter,
-                        FunctionOpInterface funcOp) const override;
-
 private:
   void printfImpl(Value formatStrStart, int formatStrByteCount, ValueRange args,
                   RewriterBase &rewriter, bool useStdErr) const;

third_party/intel/backend/compiler.py

Lines changed: 2 additions & 1 deletion
@@ -328,8 +328,9 @@ def make_llir(src, metadata, options):
     # solutions for SLM allocation, so this will crash on some operations
     # being used, e.g., convert_layout.
     if os.getenv("TRITON_INTEL_REDUCE_TRANSPOSE", "0") != "1":
-        intel.passes.ttgpuir.add_allocate_shared_memory(pm)
+        passes.ttgpuir.add_allocate_shared_memory(pm)
     intel.passes.ttgpuir.add_to_llvmir(pm, options.advanced_path, options.one_matrix_per_load_for_bt)
+    intel.passes.ttgpuir.add_rewrite_stack_ptr(pm)
     intel.set_fast_math(mod)
     passes.convert.add_arith_to_llvmir(pm)
     passes.common.add_canonicalizer(pm)

third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td

Lines changed: 15 additions & 0 deletions
@@ -362,4 +362,19 @@ tt.func @test(%arg0: tensor<16x32xf32, #mma>) -> tensor<16xf32, #ttg.slice<{dim
                           "mlir::triton::gpu::TritonGPUDialect"];
 }
 
+def TritonIntelGPURewriteStackPtr
+    : Pass<"tritonintelgpu-rewrite-stack-ptr", "mlir::ModuleOp"> {
+  let summary = "rewrite the getStackPointer for Intel by addressofOp replacement";
+
+  let description = [{
+    This pass searches for the global_smem symbol and replaces the addressOfOp with a newly inserted
+    SLM parameter or a PoisonOp to rewrite the getStackPointer for Intel.
+  }];
+
+  let dependentDialects = [
+    "mlir::triton::gpu::TritonGPUDialect",
+    "mlir::triton::gpu::intel::TritonIntelGPUDialect", "mlir::scf::SCFDialect",
+    "mlir::arith::ArithDialect"
+  ];
+}
 #endif // TRITON_INTEL_GPU_PASSES
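Concretely, the pass replaces the `llvm.mlir.addressof @global_smem` emitted by the shared lowering either with an SLM pointer appended to the kernel's parameter list (when the module allocates shared memory, `ttg.shared != 0`) or with an `llvm.mlir.poison` value (when it does not). A minimal before/after sketch, inferred from the lit test above (kernel names are illustrative):

```mlir
module {
  llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>

  // Before --tritonintelgpu-rewrite-stack-ptr: the shared lowering reads the
  // stack base from the @global_smem module global.
  llvm.func spir_kernelcc @kernel_before(%arg0: !llvm.ptr<1>) {
    %base = llvm.mlir.addressof @global_smem : !llvm.ptr<3>
    %c0 = llvm.mlir.constant(0 : i32) : i32
    %ptr = llvm.getelementptr %base[%c0] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, i8
    llvm.return
  }

  // After: with ttg.shared != 0, the kernel gains a trailing !llvm.ptr<3>
  // SLM parameter and the addressof is replaced by that argument; with
  // ttg.shared = 0 it would be replaced by llvm.mlir.poison instead.
  llvm.func spir_kernelcc @kernel_after(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<3>) {
    %c0 = llvm.mlir.constant(0 : i32) : i32
    %ptr = llvm.getelementptr %arg1[%c0] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, i8
    llvm.return
  }
}
```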
