diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 172029196905d..0fa47d3f48a83 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1103,12 +1103,63 @@ struct DeferredStore {
 };
 } // namespace
 
+/// Check whether allocations for the given operation might have to be done in
+/// device shared memory. That means we're compiling for an offloading target,
+/// the operation is an `omp::TargetOp` or nested inside of one, and that
+/// target region represents a Generic (non-SPMD) kernel.
+///
+/// This represents a necessary but not sufficient set of conditions to use
+/// device shared memory in place of regular allocas. For some variables, the
+/// associated OpenMP construct or their uses might also need to be taken into
+/// account.
+static bool
+mightAllocInDeviceSharedMemory(Operation &op,
+                               const llvm::OpenMPIRBuilder &ompBuilder) {
+  if (!ompBuilder.Config.isTargetDevice())
+    return false;
+
+  auto targetOp = dyn_cast<omp::TargetOp>(op);
+  if (!targetOp)
+    targetOp = op.getParentOfType<omp::TargetOp>();
+
+  return targetOp &&
+         targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) ==
+             omp::TargetExecMode::generic;
+}
+
+/// Check whether the entry block argument representing the private copy of a
+/// variable in an OpenMP construct must be allocated in device shared memory,
+/// based on what the uses of that copy are.
+///
+/// This must only be called if a previous call to
+/// \c mightAllocInDeviceSharedMemory has already returned \c true for the
+/// operation that owns the specified block argument.
+static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
+  Operation *parentOp = value.getOwner()->getParentOp();
+  auto targetOp = dyn_cast<omp::TargetOp>(parentOp);
+  if (!targetOp)
+    targetOp = parentOp->getParentOfType<omp::TargetOp>();
+  assert(targetOp && "expected a parent omp.target operation");
+
+  for (auto *user : value.getUsers()) {
+    if (auto parallelOp = dyn_cast<omp::ParallelOp>(user)) {
+      if (llvm::is_contained(parallelOp.getReductionVars(), value))
+        return true;
+    } else if (auto parallelOp = user->getParentOfType<omp::ParallelOp>()) {
+      if (parentOp->isProperAncestor(parallelOp))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 /// Allocate space for privatized reduction variables.
/// `deferredStores` contains information to create store operations which need
 /// to be inserted after all allocas
 template <typename T>
 static LogicalResult
-allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
+allocReductionVars(T op, ArrayRef<BlockArgument> reductionArgs,
                    llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation,
                    const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1120,10 +1171,14 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool useDeviceSharedMem =
+      isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+
   // delay creating stores until after all allocas
-  deferredStores.reserve(loop.getNumReductionVars());
+  deferredStores.reserve(op.getNumReductionVars());
 
-  for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) {
+  for (std::size_t i = 0; i < op.getNumReductionVars(); ++i) {
     Region &allocRegion = reductionDecls[i].getAllocRegion();
     if (isByRefs[i]) {
       if (allocRegion.empty())
@@ -1132,7 +1187,7 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
       SmallVector<llvm::Value *> phis;
       if (failed(inlineConvertOmpRegions(allocRegion, "omp.reduction.alloc",
                                          builder, moduleTranslation, &phis)))
-        return loop.emitError(
+        return op.emitError(
             "failed to inline `alloc` region of `omp.declare_reduction`");
 
       assert(phis.size() == 1 && "expected one allocation to be yielded");
@@ -1140,33 +1195,43 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
 
       // Allocate reduction variable (which is a pointer to the real reduction
      // variable allocated in the inlined region)
-      llvm::Value *var = builder.CreateAlloca(
-          moduleTranslation.convertType(reductionDecls[i].getType()));
       llvm::Type *ptrTy = builder.getPtrTy();
-      llvm::Value *castVar =
-          builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+      llvm::Type *varTy =
+          moduleTranslation.convertType(reductionDecls[i].getType());
+      llvm::Value *var;
+      if (useDeviceSharedMem) {
+        var = ompBuilder->createOMPAllocShared(builder, varTy);
+      } else {
+        var = builder.CreateAlloca(varTy);
+        var = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+      }
+
       llvm::Value *castPhi =
           builder.CreatePointerBitCastOrAddrSpaceCast(phis[0], ptrTy);
 
-      deferredStores.emplace_back(castPhi, castVar);
+      deferredStores.emplace_back(castPhi, var);
 
-      privateReductionVariables[i] = castVar;
+      privateReductionVariables[i] = var;
       moduleTranslation.mapValue(reductionArgs[i], castPhi);
-      reductionVariableMap.try_emplace(loop.getReductionVars()[i], castPhi);
+      reductionVariableMap.try_emplace(op.getReductionVars()[i], castPhi);
     } else {
       assert(allocRegion.empty() &&
              "allocation is implicit for by-val reduction");
-      llvm::Value *var = builder.CreateAlloca(
-          moduleTranslation.convertType(reductionDecls[i].getType()));
       llvm::Type *ptrTy = builder.getPtrTy();
-      llvm::Value *castVar =
-          builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+      llvm::Type *varTy =
+          moduleTranslation.convertType(reductionDecls[i].getType());
+      llvm::Value *var;
+      if (useDeviceSharedMem) {
+        var = ompBuilder->createOMPAllocShared(builder, varTy);
+      } else {
+        var = builder.CreateAlloca(varTy);
+        var = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+      }
 
-      moduleTranslation.mapValue(reductionArgs[i], castVar);
-      privateReductionVariables[i] = castVar;
-      reductionVariableMap.try_emplace(loop.getReductionVars()[i], castVar);
+      moduleTranslation.mapValue(reductionArgs[i], var);
+      privateReductionVariables[i] = var;
+      reductionVariableMap.try_emplace(op.getReductionVars()[i], var);
     }
   }
 
@@ -1228,6 +1293,10 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
   if (op.getNumReductionVars() == 0)
     return success();
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool useDeviceSharedMem =
+      isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+
   llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
   auto allocaIP = llvm::IRBuilderBase::InsertPoint(
       latestAllocaBlock, latestAllocaBlock->getTerminator()->getIterator());
@@ -1242,8 +1311,12 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
       // TODO: remove after all users of by-ref are updated to use the alloc
       // region: Allocate reduction variable (which is a pointer to the real
       // reduction variable allocated in the inlined region)
-      byRefVars[i] = builder.CreateAlloca(
-          moduleTranslation.convertType(reductionDecls[i].getType()));
+      llvm::Type *varTy =
+          moduleTranslation.convertType(reductionDecls[i].getType());
+      if (useDeviceSharedMem)
+        byRefVars[i] = ompBuilder->createOMPAllocShared(builder, varTy);
+      else
+        byRefVars[i] = builder.CreateAlloca(varTy);
     }
   }
 
@@ -1439,10 +1512,20 @@ static LogicalResult createReductionsAndCleanup(
       [](omp::DeclareReductionOp reductionDecl) {
         return &reductionDecl.getCleanupRegion();
       });
-  return inlineOmpRegionCleanup(reductionRegions, privateReductionVariables,
-                                moduleTranslation, builder,
-                                "omp.reduction.cleanup");
-  return success();
+  LogicalResult result = inlineOmpRegionCleanup(
+      reductionRegions, privateReductionVariables, moduleTranslation, builder,
+      "omp.reduction.cleanup");
+
+  bool useDeviceSharedMem =
+      isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+  if (useDeviceSharedMem) {
+    for (auto [var, reductionDecl] :
+         llvm::zip_equal(privateReductionVariables, reductionDecls))
+      ompBuilder->createOMPFreeShared(
+          builder, var, moduleTranslation.convertType(reductionDecl.getType()));
+  }
+
+  return result;
 }
 
 static ArrayRef<bool> getIsByRef(std::optional<ArrayRef<bool>> attr) {
@@ -1587,8 +1670,9 @@ initPrivateVars(llvm::IRBuilderBase &builder,
 
 /// Allocate and initialize delayed private variables. Returns the basic block
 /// which comes after all of these allocations. An `llvm::Value *` for each of
 /// these private variables is populated in llvmPrivateVars.
+template <typename T>
 static llvm::Expected<llvm::BasicBlock *>
-allocatePrivateVars(llvm::IRBuilderBase &builder,
+allocatePrivateVars(T op, llvm::IRBuilderBase &builder,
                     LLVM::ModuleTranslation &moduleTranslation,
                     PrivateVarsInfo &privateVarsInfo,
                     const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
@@ -1611,6 +1695,10 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
   llvm::DataLayout dataLayout = builder.GetInsertBlock()->getDataLayout();
   llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool mightUseDeviceSharedMem =
+      isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
+      mightAllocInDeviceSharedMemory(*op, *ompBuilder);
   unsigned int allocaAS =
       moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace();
   unsigned int defaultAS = moduleTranslation.getLLVMModule()
@@ -1623,11 +1711,17 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
     llvm::Type *llvmAllocType =
         moduleTranslation.convertType(privDecl.getType());
     builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
-    llvm::Value *llvmPrivateVar = builder.CreateAlloca(
-        llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
-    if (allocaAS != defaultAS)
-      llvmPrivateVar = builder.CreateAddrSpaceCast(llvmPrivateVar,
-                                                   builder.getPtrTy(defaultAS));
+    llvm::Value *llvmPrivateVar = nullptr;
+    if (mightUseDeviceSharedMem &&
+        mustAllocPrivateVarInDeviceSharedMemory(blockArg)) {
+      llvmPrivateVar = ompBuilder->createOMPAllocShared(builder, llvmAllocType);
+    } else {
+      llvmPrivateVar = builder.CreateAlloca(
+          llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
+      if (allocaAS != defaultAS)
+        llvmPrivateVar = builder.CreateAddrSpaceCast(
+            llvmPrivateVar, builder.getPtrTy(defaultAS));
+    }
 
     privateVarsInfo.llvmVars.push_back(llvmPrivateVar);
   }
@@ -1699,24 +1793,41 @@ static LogicalResult copyFirstPrivateVars(
   return success();
 }
 
+template <typename T>
 static LogicalResult
-cleanupPrivateVars(llvm::IRBuilderBase &builder,
+cleanupPrivateVars(T op, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation, Location loc,
-                   SmallVectorImpl<llvm::Value *> &llvmPrivateVars,
-                   SmallVectorImpl<omp::PrivateClauseOp> &privateDecls) {
+                   PrivateVarsInfo &privateVarsInfo) {
   // private variable deallocation
   SmallVector<Region *> privateCleanupRegions;
-  llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions),
+  llvm::transform(privateVarsInfo.privatizers,
+                  std::back_inserter(privateCleanupRegions),
                   [](omp::PrivateClauseOp privatizer) {
                     return &privatizer.getDeallocRegion();
                   });
 
-  if (failed(inlineOmpRegionCleanup(
-          privateCleanupRegions, llvmPrivateVars, moduleTranslation, builder,
-          "omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false)))
+  if (failed(inlineOmpRegionCleanup(privateCleanupRegions,
+                                    privateVarsInfo.llvmVars, moduleTranslation,
+                                    builder, "omp.private.dealloc",
+                                    /*shouldLoadCleanupRegionArg=*/false)))
     return mlir::emitError(loc, "failed to inline `dealloc` region of an "
                                 "`omp.private` op in");
 
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  bool mightUseDeviceSharedMem =
+      isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
+      mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+  for (auto [privDecl, llvmPrivVar, blockArg] :
+       llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars,
+                       privateVarsInfo.blockArgs)) {
+    if (mightUseDeviceSharedMem &&
+        mustAllocPrivateVarInDeviceSharedMemory(blockArg)) {
+      ompBuilder->createOMPFreeShared(
+          builder, llvmPrivVar,
+          moduleTranslation.convertType(privDecl.getType()));
+    }
+  }
+
   return success();
 }
 
@@ -2383,9 +2494,8 @@ convertOmpTaskOp(omp::TaskOp taskOp,
llvm::IRBuilderBase &builder,
   builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
 
-  if (failed(cleanupPrivateVars(builder, moduleTranslation, taskOp.getLoc(),
-                                privateVarsInfo.llvmVars,
-                                privateVarsInfo.privatizers)))
+  if (failed(cleanupPrivateVars(taskOp, builder, moduleTranslation,
+                                taskOp.getLoc(), privateVarsInfo)))
     return llvm::make_error<PreviouslyReportedError>();
 
   // Free heap allocated task context structure at the end of the task.
@@ -2502,7 +2612,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
                                     wsloopOp.getNumReductionVars());
 
   llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
-      builder, moduleTranslation, privateVarsInfo, allocaIP);
+      wsloopOp, builder, moduleTranslation, privateVarsInfo, allocaIP);
   if (handleError(afterAllocas, opInst).failed())
     return failure();
 
@@ -2644,9 +2754,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
                                    /*isTeamsReduction=*/false)))
     return failure();
 
-  return cleanupPrivateVars(builder, moduleTranslation, wsloopOp.getLoc(),
-                            privateVarsInfo.llvmVars,
-                            privateVarsInfo.privatizers);
+  return cleanupPrivateVars(wsloopOp, builder, moduleTranslation,
+                            wsloopOp.getLoc(), privateVarsInfo);
 }
 
 /// Converts the OpenMP parallel operation to LLVM IR.
@@ -2673,7 +2782,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
   auto bodyGenCB = [&](InsertPointTy allocaIP,
                        InsertPointTy codeGenIP) -> llvm::Error {
     llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
-        builder, moduleTranslation, privateVarsInfo, allocaIP);
+        opInst, builder, moduleTranslation, privateVarsInfo, allocaIP);
     if (handleError(afterAllocas, *opInst).failed())
       return llvm::make_error<PreviouslyReportedError>();
 
@@ -2787,9 +2896,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       return llvm::createStringError(
           "failed to inline `cleanup` region of `omp.declare_reduction`");
 
-    if (failed(cleanupPrivateVars(builder, moduleTranslation, opInst.getLoc(),
-                                  privateVarsInfo.llvmVars,
-                                  privateVarsInfo.privatizers)))
+    if (failed(cleanupPrivateVars(opInst, builder, moduleTranslation,
+                                  opInst.getLoc(), privateVarsInfo)))
       return llvm::make_error<PreviouslyReportedError>();
 
     builder.restoreIP(oldIP);
@@ -2861,7 +2969,7 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
       findAllocaInsertPoint(builder, moduleTranslation);
 
   llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
-      builder, moduleTranslation, privateVarsInfo, allocaIP);
+      simdOp, builder, moduleTranslation, privateVarsInfo, allocaIP);
   if (handleError(afterAllocas, opInst).failed())
     return failure();
 
@@ -2981,9 +3089,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
                                        "omp.reduction.cleanup")))
     return failure();
 
-  return cleanupPrivateVars(builder, moduleTranslation, simdOp.getLoc(),
-                            privateVarsInfo.llvmVars,
-                            privateVarsInfo.privatizers);
+  return cleanupPrivateVars(simdOp, builder, moduleTranslation, simdOp.getLoc(),
+                            privateVarsInfo);
 }
 
 /// Converts an OpenMP loop nest into LLVM IR using OpenMPIRBuilder.
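// For illustration, the discipline the hunks above and below follow when
// device shared memory is in play: every `createOMPAllocShared` emitted at the
// alloca insertion point is paired with a `createOMPFreeShared` at cleanup,
// called with the same LLVM type so the runtime free receives the same size as
// the allocation. A minimal sketch of that pairing, assuming the
// `OpenMPIRBuilder` entry points introduced by this patch (the helper names
// `allocVar`/`freeVar` are hypothetical and not part of the patch):
//
//   // Allocate either in device shared memory or on the stack, mirroring the
//   // branch structure used in `allocReductionVars`/`allocatePrivateVars`.
//   static llvm::Value *allocVar(llvm::OpenMPIRBuilder &ompBuilder,
//                                llvm::IRBuilderBase &builder,
//                                llvm::Type *varTy, bool useDeviceSharedMem) {
//     if (useDeviceSharedMem)
//       return ompBuilder.createOMPAllocShared(builder, varTy);
//     llvm::Value *var = builder.CreateAlloca(varTy);
//     return builder.CreatePointerBitCastOrAddrSpaceCast(var,
//                                                        builder.getPtrTy());
//   }
//
//   // The matching cleanup; `varTy` must match the allocation so the emitted
//   // __kmpc_free_shared is passed the same size as __kmpc_alloc_shared.
//   static void freeVar(llvm::OpenMPIRBuilder &ompBuilder,
//                       llvm::IRBuilderBase &builder, llvm::Value *var,
//                       llvm::Type *varTy, bool useDeviceSharedMem) {
//     if (useDeviceSharedMem)
//       ompBuilder.createOMPFreeShared(builder, var, varTy);
//     // Stack allocations need no explicit free.
//   }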
@@ -4884,8 +4991,8 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
         builder.restoreIP(codeGenIP);
         PrivateVarsInfo privVarsInfo(distributeOp);
 
-        llvm::Expected<llvm::BasicBlock *> afterAllocas =
-            allocatePrivateVars(builder, moduleTranslation, privVarsInfo, allocaIP);
+        llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
+            distributeOp, builder, moduleTranslation, privVarsInfo, allocaIP);
         if (handleError(afterAllocas, opInst).failed())
           return llvm::make_error<PreviouslyReportedError>();
 
@@ -4938,9 +5045,8 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
             return wsloopIP.takeError();
         }
 
-        if (failed(cleanupPrivateVars(builder, moduleTranslation,
-                                      distributeOp.getLoc(), privVarsInfo.llvmVars,
-                                      privVarsInfo.privatizers)))
+        if (failed(cleanupPrivateVars(distributeOp, builder, moduleTranslation,
+                                      distributeOp.getLoc(), privVarsInfo)))
           return llvm::make_error<PreviouslyReportedError>();
 
         return llvm::Error::success();
@@ -5675,8 +5781,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
     PrivateVarsInfo privateVarsInfo(targetOp);
 
     llvm::Expected<llvm::BasicBlock *> afterAllocas =
-        allocatePrivateVars(builder, moduleTranslation, privateVarsInfo,
-                            allocaIP, &mappedPrivateVars);
+        allocatePrivateVars(targetOp, builder, moduleTranslation,
+                            privateVarsInfo, allocaIP, &mappedPrivateVars);
 
     if (failed(handleError(afterAllocas, *targetOp)))
       return llvm::make_error<PreviouslyReportedError>();
diff --git a/mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir b/mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir
new file mode 100644
index 0000000000000..f2063bc8b79b3
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir
@@ -0,0 +1,109 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// This test checks that, when compiling for an offloading target, device
+// shared memory will be used in place of allocas for certain private and
+// reduction variables.
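+//
+// Context for the checks below: the first target region is a Generic
+// (non-SPMD) kernel, so the `omp.distribute` loop runs on the team's main
+// thread and only the nested `omp.parallel` activates worker threads. Private
+// or reduction variables that those workers must see cannot live on the main
+// thread's private stack, so they are allocated through the device runtime
+// instead. The expected shape (illustrative only; the authoritative patterns
+// are the CHECK lines below) is:
+//
+//   %ptr = call align 8 ptr @__kmpc_alloc_shared(i64 4)   ; 4 == sizeof(i32)
+//   ...
+//   call void @__kmpc_free_shared(ptr %ptr, i64 4)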
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+  omp.private {type = private} @privatizer : i32
+  omp.declare_reduction @reduction : i32 init {
+  ^bb0(%arg0: i32):
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    omp.yield(%0 : i32)
+  } combiner {
+  ^bb0(%arg0: i32, %arg1: i32):
+    %0 = llvm.add %arg0, %arg1 : i32
+    omp.yield(%0 : i32)
+  }
+  llvm.func @main() {
+    %c0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %c0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    %3 = llvm.alloca %c0 x i32 {bindc_name = "y"} : (i64) -> !llvm.ptr<5>
+    %4 = llvm.addrspacecast %3 : !llvm.ptr<5> to !llvm.ptr
+    %5 = llvm.alloca %c0 x i32 {bindc_name = "z"} : (i64) -> !llvm.ptr<5>
+    %6 = llvm.addrspacecast %5 : !llvm.ptr<5> to !llvm.ptr
+    %7 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"}
+    %8 = omp.map.info var_ptr(%4 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "y"}
+    %9 = omp.map.info var_ptr(%6 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "z"}
+    omp.target map_entries(%7 -> %arg0, %8 -> %arg1, %9 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+      %11 = llvm.mlir.constant(10000 : i32) : i32
+      %12 = llvm.mlir.constant(1 : i32) : i32
+      omp.teams reduction(@reduction %arg0 -> %arg3 : !llvm.ptr) {
+        omp.distribute private(@privatizer %arg1 -> %arg4, @privatizer %arg2 -> %arg5 : !llvm.ptr, !llvm.ptr) {
+          omp.loop_nest (%arg6) : i32 = (%12) to (%11) inclusive step (%12) {
+            llvm.store %arg6, %arg4 : i32, !llvm.ptr
+            %13 = llvm.load %arg3 : !llvm.ptr -> i32
+            %14 = llvm.add %13, %12 : i32
+            llvm.store %14, %arg3 : i32, !llvm.ptr
+            omp.parallel reduction(@reduction %arg5 -> %arg7 : !llvm.ptr) {
+              %15 = llvm.load %arg4 : !llvm.ptr -> i32
+              %16 = llvm.load %arg7 : !llvm.ptr -> i32
+              %17 = llvm.add %15, %16 : i32
+              llvm.store %17, %arg7 : i32, !llvm.ptr
+              omp.terminator
+            }
+            omp.yield
+          }
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    // CHECK: call i32 @__kmpc_target_init
+    // CHECK: call void @[[OUTLINED_TARGET:__omp_offloading_[A-Za-z0-9_.]*]]
+
+    // CHECK: define internal void @[[OUTLINED_TARGET]]
+    // CHECK: %[[X_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)
+    // CHECK: %[[GEP_X:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg
+    // CHECK-NEXT: store ptr %[[X_PRIV]], ptr addrspace(5) %[[GEP_X]]
+    // CHECK-NEXT: call void @[[OUTLINED_TEAMS:__omp_offloading_[A-Za-z0-9_.]*]](ptr %structArg.ascast)
+
+    // CHECK: [[REDUCE_FINALIZE_BB:reduce\.finalize.*]]:
+    // CHECK-NEXT: %{{.*}} = call i32 @__kmpc_global_thread_num
+    // CHECK-NEXT: call void @__kmpc_barrier
+    // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[X_PRIV]], i64 4)
+
+    // CHECK: define internal void @[[OUTLINED_TEAMS]]
+    // CHECK: %[[Y_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)
+    // CHECK: %[[Z_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)
+
+    // CHECK: %[[GEP_Y:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg
+    // CHECK-NEXT: store ptr %[[Y_PRIV]], ptr addrspace(5) %[[GEP_Y]], align 8
+    // CHECK: %[[GEP_Z:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg
+    // CHECK-NEXT: store ptr %[[Z_PRIV]], ptr addrspace(5) %[[GEP_Z]], align 8
+
+    // CHECK: call void @__kmpc_free_shared(ptr %[[Y_PRIV]], i64 4)
+    // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[Z_PRIV]], i64 4)
+    // CHECK-NEXT: br label %[[EXIT_BB:.*]]
+
+    // CHECK: [[EXIT_BB]]:
+    // CHECK-NEXT: ret void
+
+    // Test that we don't misidentify a private `distribute` value as being
+    // used inside of a parallel region when the only enclosing parallel
+    // region is not nested inside of `omp.distribute`.
+    omp.parallel {
+      %18 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"}
+      omp.target map_entries(%18 -> %arg0 : !llvm.ptr) {
+        %19 = llvm.mlir.constant(10000 : i32) : i32
+        %20 = llvm.mlir.constant(1 : i32) : i32
+        omp.teams {
+          omp.distribute private(@privatizer %arg0 -> %arg1 : !llvm.ptr) {
+            omp.loop_nest (%arg2) : i32 = (%20) to (%19) inclusive step (%20) {
+              llvm.store %arg2, %arg1 : i32, !llvm.ptr
+              omp.yield
+            }
+          }
+          omp.terminator
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    // CHECK: call i32 @__kmpc_target_init
+    // CHECK-NOT: call {{.*}} @__kmpc_alloc_shared
+    // CHECK-NOT: call {{.*}} @__kmpc_free_shared
+
+    llvm.return
+  }
+}
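+
+// For reference, the first target region above roughly corresponds to the
+// following OpenMP source shape (hypothetical; the MLIR was not generated
+// from this exact program):
+//
+//   #pragma omp target map(tofrom: x, y, z)
+//   #pragma omp teams reduction(+: x)
+//   #pragma omp distribute private(y, z)
+//   for (int i = 1; i <= 10000; ++i) {
+//     y = i;
+//     x = x + 1;
+//     #pragma omp parallel reduction(+: z)
+//     z = y + z;
+//   }
+//
+// `y` is read and `z` is reduced inside the nested parallel region, and `x`
+// is a teams reduction variable, so all three copies must be visible across
+// threads and are expected to land in device shared memory.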