
Commit ca1833b

[mlir][OpenMP] cast address space of private variables (llvm#130301)
Fixes llvm#130159. The problem is that the alloca created for the private variable uses the module's default alloca address space, but the function the pointer is passed to expects a different address space, leading to a type mismatch in the function argument. I know nothing about how AMDGPU is supposed to work. I based this solution on code from createDeviceArgumentAccessor(). Could somebody from AMD please confirm this solution is appropriate.
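As background for the fix, the sketch below shows the general IRBuilder pattern in isolation: create the alloca in the data layout's alloca address space, then addrspacecast the resulting pointer to the program (default) address space before using it where a plain ptr is expected. This is a minimal, standalone illustration, not code from this patch; the simplified data layout string, function name, and variable names are assumptions made up for the example.

// Minimal sketch (assumptions: simplified AMDGPU-like data layout with
// allocas in addrspace(5); hypothetical function and variable names).
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext ctx;
  llvm::Module mod("example", ctx);
  // "A5" puts allocas in address space 5; the program address space stays 0.
  mod.setDataLayout("e-p:64:64-p5:32:32-A5");

  llvm::IRBuilder<> builder(ctx);
  auto *fnTy = llvm::FunctionType::get(builder.getVoidTy(), /*isVarArg=*/false);
  auto *fn = llvm::Function::Create(fnTy, llvm::Function::ExternalLinkage,
                                    "use_private", mod);
  builder.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", fn));

  unsigned allocaAS = mod.getDataLayout().getAllocaAddrSpace();      // 5 here
  unsigned defaultAS = mod.getDataLayout().getProgramAddressSpace(); // 0 here

  // The alloca lands in addrspace(5); cast it so later uses see a default-AS ptr.
  llvm::Value *priv =
      builder.CreateAlloca(builder.getInt32Ty(), nullptr, "omp.private.alloc");
  if (allocaAS != defaultAS)
    priv = builder.CreateAddrSpaceCast(priv, builder.getPtrTy(defaultAS));

  builder.CreateStore(builder.getInt32(0), priv); // use the cast pointer
  builder.CreateRetVoid();
  mod.print(llvm::outs(), nullptr); // shows the alloca + addrspacecast pair
}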
1 parent c59713c commit ca1833b

File tree

2 files changed, +56 -0 lines


mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 10 additions & 0 deletions
@@ -1452,13 +1452,23 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
 
   llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);
 
+  unsigned int allocaAS =
+      moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace();
+  unsigned int defaultAS = moduleTranslation.getLLVMModule()
+                               ->getDataLayout()
+                               .getProgramAddressSpace();
+
   for (auto [privDecl, mlirPrivVar, blockArg] :
        llvm::zip_equal(privateDecls, mlirPrivateVars, privateBlockArgs)) {
     llvm::Type *llvmAllocType =
         moduleTranslation.convertType(privDecl.getType());
     builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
     llvm::Value *llvmPrivateVar = builder.CreateAlloca(
         llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
+    if (allocaAS != defaultAS)
+      llvmPrivateVar = builder.CreateAddrSpaceCast(llvmPrivateVar,
+                                                   builder.getPtrTy(defaultAS));
+
     llvmPrivateVars.push_back(llvmPrivateVar);
   }

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Regression test for calling a function with a pointer alloca'ed on the
+// device for a private variable.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+  omp.private {type = private} @_QMmodFfailingEi_private_i32 : i32
+  llvm.func @_QMotherProutine(%arg0: !llvm.ptr {fir.bindc_name = "i", llvm.nocapture}) attributes {frame_pointer = #llvm.framePointerKind<all>, omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>, target_cpu = "gfx90a", target_features = #llvm.target_features<["+16-bit-insts", "+atomic-buffer-global-pk-add-f16-insts", "+atomic-fadd-rtn-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts", "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx8-insts", "+gfx9-insts", "+gfx90a-insts", "+gws", "+image-insts", "+mai-insts", "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]>} {
+    llvm.return
+  }
+  llvm.func @_QMmodPfailing(%arg0: !llvm.ptr {fir.bindc_name = "d", llvm.nocapture}) attributes {frame_pointer = #llvm.framePointerKind<all>, omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>, target_cpu = "gfx90a", target_features = #llvm.target_features<["+16-bit-insts", "+atomic-buffer-global-pk-add-f16-insts", "+atomic-fadd-rtn-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts", "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx8-insts", "+gfx9-insts", "+gfx90a-insts", "+gws", "+image-insts", "+mai-insts", "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]>} {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"}
+    %5 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "d"}
+    omp.target map_entries(%4 -> %arg1, %5 -> %arg2 : !llvm.ptr, !llvm.ptr) {
+      %6 = llvm.mlir.constant(1 : i32) : i32
+      omp.teams {
+
+        // CHECK: omp.par.entry:
+        // CHECK: %[[TID_ADDR_LOCAL:.*]] = alloca i32, align 4, addrspace(5)
+        // CHECK: %[[OMP_PRIVATE_ALLOC:omp\.private\.alloc]] = alloca i32, align 4, addrspace(5)
+        // CHECK-NEXT: %[[CAST:.*]] = addrspacecast ptr addrspace(5) %[[OMP_PRIVATE_ALLOC]] to ptr
+
+        omp.parallel private(@_QMmodFfailingEi_private_i32 %arg1 -> %arg3 : !llvm.ptr) {
+          %7 = llvm.load %arg2 : !llvm.ptr -> i32
+          omp.distribute {
+            omp.wsloop {
+              omp.loop_nest (%arg4) : i32 = (%6) to (%7) inclusive step (%6) {
+                llvm.store %arg4, %arg3 : i32, !llvm.ptr
+                llvm.call @_QMotherProutine(%arg3) {fastmathFlags = #llvm.fastmath<contract>} : (!llvm.ptr) -> ()
+                omp.yield
+              }
+            } {omp.composite}
+          } {omp.composite}
+          omp.terminator
+        } {omp.composite}
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
