35 changes: 22 additions & 13 deletions flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -376,6 +376,8 @@ class PopulateInitAndCleanupRegionsHelper {
loadedMoldArg = builder.loadIfRef(loc, moldArg);
return loadedMoldArg;
}

bool shouldAllocateTempOnStack() const;
};

} // namespace
@@ -438,8 +440,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
}

mlir::Value valAlloc = builder.createHeapTemporary(loc, innerTy, /*name=*/{},
/*shape=*/{}, lenParams);
bool shouldAllocateOnStack = shouldAllocateTempOnStack();
mlir::Value valAlloc =
(shouldAllocateOnStack)
? builder.createTemporary(loc, innerTy, /*name=*/{},
/*shape=*/{}, lenParams)
: builder.createHeapTemporary(loc, innerTy, /*name=*/{},
/*shape=*/{}, lenParams);

if (scalarInitValue)
builder.createStoreWithConvert(loc, scalarInitValue, valAlloc);
mlir::Value box = fir::EmboxOp::create(builder, loc, valType, valAlloc,
@@ -451,8 +459,9 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
fir::StoreOp lastOp =
fir::StoreOp::create(builder, loc, box, allocatedPrivVarArg);

createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
isDoConcurrent);
if (!shouldAllocateOnStack)
createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
isDoConcurrent);

if (ifUnallocated)
builder.setInsertionPointAfter(ifUnallocated);
@@ -462,6 +471,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
createYield(allocatedPrivVarArg);
}

bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
  // On the GPU, always allocate on the stack since heap allocations are very
// expensive.
auto offloadMod =
llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
return offloadMod && offloadMod.getIsGPU();
}

void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
fir::BaseBoxType boxTy, bool needsInitialization) {
bool isAllocatableOrPointer =
@@ -504,15 +521,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
// Allocating on the heap in case the whole reduction/privatization is nested
// inside of a loop
auto temp = [&]() {
bool shouldAllocateOnStack = false;

    // On the GPU, always allocate on the stack since heap allocations are very
// expensive.
if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
*builder.getModule()))
shouldAllocateOnStack = offloadMod.getIsGPU();

if (shouldAllocateOnStack)
if (shouldAllocateTempOnStack())
return createStackTempFromMold(loc, builder, source);

auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
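
As a minimal standalone sketch (not part of the patch) of the allocation decision the new shouldAllocateTempOnStack() helper drives: the FIRBuilder calls and the OffloadModuleInterface query below are the ones used in the hunks above, while the wrapper function createPrivateTempSketch and its signature are hypothetical, for illustration only.

// Illustrative sketch only; headers as in PrivateReductionUtils.cpp.
// On GPU targets a fir.alloca-backed temporary is created; on the host the
// heap temporary (fir.allocmem) is kept so it can outlive nested loops and be
// freed later by the cleanup region.
static mlir::Value createPrivateTempSketch(fir::FirOpBuilder &builder,
                                           mlir::Location loc,
                                           mlir::Type innerTy,
                                           mlir::ValueRange lenParams) {
  auto offloadMod =
      llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
  bool onGPU = offloadMod && offloadMod.getIsGPU();
  if (onGPU)
    // Heap allocation is expensive on the GPU, so use a stack temporary.
    return builder.createTemporary(loc, innerTy, /*name=*/{},
                                   /*shape=*/{}, lenParams);
  // Host path: heap temporary, deallocated in the cleanup region.
  return builder.createHeapTemporary(loc, innerTy, /*name=*/{},
                                     /*shape=*/{}, lenParams);
}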
@@ -1,9 +1,22 @@
! Tests delayed privatization for `target ... private(..)` for allocatables.

! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \
! RUN: -o - %s 2>&1 | FileCheck %s
! RUN: -o - %s 2>&1 | FileCheck %s --check-prefix=CPU

! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging -o - %s 2>&1 \
! RUN: | FileCheck %s
! RUN: | FileCheck %s --check-prefix=CPU

! RUN: %if amdgpu-registered-target %{ \
! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir \
! RUN: -fopenmp -fopenmp-is-target-device \
! RUN: -mmlir --enable-delayed-privatization-staging \
! RUN: -o - %s 2>&1 | \
! RUN: FileCheck %s --check-prefix=GPU \
! RUN: %}

! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging \
! RUN: -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 \
! RUN: | FileCheck %s --check-prefix=GPU

subroutine target_allocatable
implicit none
@@ -14,53 +27,65 @@ subroutine target_allocatable
!$omp end target
end subroutine target_allocatable

! CHECK-LABEL: omp.private {type = private}
! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] :
! CHECK-SAME: [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
! CHECK: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
! CPU-LABEL: omp.private {type = private}
! CPU-SAME: @[[VAR_PRIVATIZER_SYM:.*]] :
! CPU-SAME: [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
! CPU: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):

! CPU-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
! CPU-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
! CPU-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : i64
! CPU-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64

! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
! CPU-NEXT: fir.if %[[ALLOC_COND]] {
! CPU-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
! CPU-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
! CPU-NEXT: } else {
! CPU-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
! CPU-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
! CPU-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
! CPU-NEXT: }

! CHECK-NEXT: fir.if %[[ALLOC_COND]] {
! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
! CHECK-NEXT: } else {
! CHECK-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
! CHECK-NEXT: }
! CPU-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])

! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])
! CPU-NEXT: } dealloc {
! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):

! CHECK-NEXT: } dealloc {
! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
! CPU-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
! CPU-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
! CPU-NEXT: %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : i64
! CPU-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64

! CHECK-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
! CHECK-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
! CHECK-NEXT: %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
! CHECK-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
! CPU-NEXT: fir.if %[[PRIV_NULL_COND]] {
! CPU-NEXT: fir.freemem %[[PRIV_ADDR]]
! CPU-NEXT: }

! CHECK-NEXT: fir.if %[[PRIV_NULL_COND]] {
! CHECK-NEXT: fir.freemem %[[PRIV_ADDR]]
! CHECK-NEXT: }
! CPU-NEXT: omp.yield
! CPU-NEXT: }

! CHECK-NEXT: omp.yield
! CHECK-NEXT: }

! CPU-LABEL: func.func @_QPtarget_allocatable() {

! CHECK-LABEL: func.func @_QPtarget_allocatable() {
! CPU: %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
! CPU-SAME: {bindc_name = "alloc_var", {{.*}}}
! CPU: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
! CPU: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
! CPU: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
! CPU: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>

! CHECK: %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
! CHECK-SAME: {bindc_name = "alloc_var", {{.*}}}
! CHECK: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
! CHECK: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
! CHECK: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
! CHECK: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
! CPU: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
! CPU-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {

! CHECK: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
! CHECK-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {
! GPU-LABEL: omp.private {type = private} {{.*}} init {
! GPU: fir.if %{{.*}} {
! GPU-NEXT: %[[ZERO_BOX:.*]] = fir.embox %{{.*}}
! GPU-NEXT: fir.store %[[ZERO_BOX]] to %{{.*}}
! GPU-NEXT: } else {
! GPU-NOT: fir.allocmem i32
! GPU-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32
! GPU-NEXT: %[[PRIV_ALLOC_BOX:.*]] = fir.embox %[[PRIV_ALLOC]]
! GPU-NEXT: fir.store %[[PRIV_ALLOC_BOX]] to %{{.*}}
! GPU-NEXT: }
! GPU-NEXT: omp.yield(%{{.*}})