From 08185bdc9cd34b24ab35a2775048a162fa2fe55e Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Fri, 7 Mar 2025 15:46:35 +0000 Subject: [PATCH] [mlir][OpenMP] cast address space of private variables Fixes #130159 The problem is that the alloca created for the private variable uses the default alloca address space in that module, but the function the pointer is being passed to expects a different address space, leading to a type mismatch in the function argument. I know nothing about how AMDGPU is supposed to work. I based this solution on code from createDeviceArgumentAccessor(). Please could somebody from AMD confirm this solution is appropriate. --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 10 ++++ .../Target/LLVMIR/omptarget-private-llvm.mlir | 46 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/omptarget-private-llvm.mlir diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 32c7c501d03c3..842308807cf02 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1452,6 +1452,12 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0); + unsigned int allocaAS = + moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace(); + unsigned int defaultAS = moduleTranslation.getLLVMModule() + ->getDataLayout() + .getProgramAddressSpace(); + for (auto [privDecl, mlirPrivVar, blockArg] : llvm::zip_equal(privateDecls, mlirPrivateVars, privateBlockArgs)) { llvm::Type *llvmAllocType = @@ -1459,6 +1465,10 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); llvm::Value *llvmPrivateVar = builder.CreateAlloca( llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc"); + if (allocaAS != defaultAS) 
+ llvmPrivateVar = builder.CreateAddrSpaceCast(llvmPrivateVar, + builder.getPtrTy(defaultAS)); + llvmPrivateVars.push_back(llvmPrivateVar); } diff --git a/mlir/test/Target/LLVMIR/omptarget-private-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-private-llvm.mlir new file mode 100644 index 0000000000000..a2500f3a579dd --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-private-llvm.mlir @@ -0,0 +1,46 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Regression test for calling a function using pointer alloca'ed on +// device for private variable + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + omp.private {type = private} @_QMmodFfailingEi_private_i32 : i32 + llvm.func @_QMotherProutine(%arg0: !llvm.ptr {fir.bindc_name = "i", llvm.nocapture}) attributes {frame_pointer = #llvm.framePointerKind, omp.declare_target = #omp.declaretarget, target_cpu = "gfx90a", target_features = #llvm.target_features<["+16-bit-insts", "+atomic-buffer-global-pk-add-f16-insts", "+atomic-fadd-rtn-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts", "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx8-insts", "+gfx9-insts", "+gfx90a-insts", "+gws", "+image-insts", "+mai-insts", "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]>} { + llvm.return + } + llvm.func @_QMmodPfailing(%arg0: !llvm.ptr {fir.bindc_name = "d", llvm.nocapture}) attributes {frame_pointer = #llvm.framePointerKind, omp.declare_target = #omp.declaretarget, target_cpu = "gfx90a", target_features = #llvm.target_features<["+16-bit-insts", 
"+atomic-buffer-global-pk-add-f16-insts", "+atomic-fadd-rtn-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts", "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx8-insts", "+gfx9-insts", "+gfx90a-insts", "+gws", "+image-insts", "+mai-insts", "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]>} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.mlir.constant(1 : i64) : i64 + %4 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} + %5 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "d"} + omp.target map_entries(%4 -> %arg1, %5 -> %arg2 : !llvm.ptr, !llvm.ptr) { + %6 = llvm.mlir.constant(1 : i32) : i32 + omp.teams { + +// CHECK: omp.par.entry: +// CHECK: %[[TID_ADDR_LOCAL:.*]] = alloca i32, align 4, addrspace(5) +// CHECK: %[[OMP_PRIVATE_ALLOC:omp\.private\.alloc]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: %[[CAST:.*]] = addrspacecast ptr addrspace(5) %[[OMP_PRIVATE_ALLOC]] to ptr + + omp.parallel private(@_QMmodFfailingEi_private_i32 %arg1 -> %arg3 : !llvm.ptr) { + %7 = llvm.load %arg2 : !llvm.ptr -> i32 + omp.distribute { + omp.wsloop { + omp.loop_nest (%arg4) : i32 = (%6) to (%7) inclusive step (%6) { + llvm.store %arg4, %arg3 : i32, !llvm.ptr + llvm.call @_QMotherProutine(%arg3) {fastmathFlags = #llvm.fastmath} : (!llvm.ptr) -> () + omp.yield + } + } {omp.composite} + } {omp.composite} + omp.terminator + } {omp.composite} + omp.terminator + } + omp.terminator + } + llvm.return + } +}