Commit 585b6e2

[flang][OpenMP] Allocate allocatable init temps on the stack for GPUs (llvm#164761)
Temps needed for the allocatable reduction/privatization init regions are currently always allocated on the heap. However, this is a performance killer for GPUs since malloc calls are prohibitively expensive, so we instead do these allocations on the stack for GPU reductions, similar to what we already do for arrays. Additionally, I am working on getting reductions-by-ref to work on GPUs, which is a bit of a challenge given the many steps involved (e.g. intra-warp and inter-warp reductions, shuffling data from remote lanes, ...); this patch is a prerequisite for that work.
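
In lowering terms the whole decision is one module-level query; a minimal sketch of the check (mirroring the shouldAllocateTempOnStack helper added in the diff below):

// Sketch: stack-allocate init temps iff the module is compiled for an
// OpenMP GPU target device, i.e. the OffloadModuleInterface reports isGPU.
static bool shouldAllocateTempOnStack(mlir::ModuleOp module) {
  auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*module);
  return offloadMod && offloadMod.getIsGPU();
}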
1 parent 128214f commit 585b6e2

File tree

2 files changed (+88, -54 lines)


flang/lib/Lower/Support/PrivateReductionUtils.cpp

Lines changed: 22 additions & 13 deletions
@@ -376,6 +376,8 @@ class PopulateInitAndCleanupRegionsHelper {
     loadedMoldArg = builder.loadIfRef(loc, moldArg);
     return loadedMoldArg;
   }
+
+  bool shouldAllocateTempOnStack() const;
 };

 } // namespace

@@ -438,8 +440,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
     builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
   }

-  mlir::Value valAlloc = builder.createHeapTemporary(loc, innerTy, /*name=*/{},
-                                                     /*shape=*/{}, lenParams);
+  bool shouldAllocateOnStack = shouldAllocateTempOnStack();
+  mlir::Value valAlloc =
+      (shouldAllocateOnStack)
+          ? builder.createTemporary(loc, innerTy, /*name=*/{},
+                                    /*shape=*/{}, lenParams)
+          : builder.createHeapTemporary(loc, innerTy, /*name=*/{},
+                                        /*shape=*/{}, lenParams);
+
   if (scalarInitValue)
     builder.createStoreWithConvert(loc, scalarInitValue, valAlloc);
   mlir::Value box = fir::EmboxOp::create(builder, loc, valType, valAlloc,

@@ -451,8 +459,9 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
   fir::StoreOp lastOp =
       fir::StoreOp::create(builder, loc, box, allocatedPrivVarArg);

-  createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
-                      isDoConcurrent);
+  if (!shouldAllocateOnStack)
+    createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
+                        isDoConcurrent);

   if (ifUnallocated)
     builder.setInsertionPointAfter(ifUnallocated);

@@ -462,6 +471,14 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
   createYield(allocatedPrivVarArg);
 }

+bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
+  // On the GPU, always allocate on the stack since heap allocations are very
+  // expensive.
+  auto offloadMod =
+      llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
+  return offloadMod && offloadMod.getIsGPU();
+}
+
 void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
     fir::BaseBoxType boxTy, bool needsInitialization) {
   bool isAllocatableOrPointer =

@@ -504,15 +521,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
   // Allocating on the heap in case the whole reduction/privatization is nested
   // inside of a loop
   auto temp = [&]() {
-    bool shouldAllocateOnStack = false;
-
-    // On the GPU, always allocate on the stack since heap allocations are very
-    // expensive.
-    if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
-            *builder.getModule()))
-      shouldAllocateOnStack = offloadMod.getIsGPU();
-
-    if (shouldAllocateOnStack)
+    if (shouldAllocateTempOnStack())
       return createStackTempFromMold(loc, builder, source);

     auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
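
Note the asymmetry the scalar path now has: createCleanupRegion is emitted only for the heap case, because a stack temp (fir.alloca) is reclaimed automatically when its frame exits, while a heap temp (fir.allocmem) must be freed via fir.freemem in the privatizer's dealloc region. A sketch of that pairing, assuming the FirOpBuilder helpers used above (emitScalarInitTemp is a hypothetical wrapper, not the actual code):

// Sketch only: each allocation kind carries its own cleanup obligation.
static mlir::Value emitScalarInitTemp(fir::FirOpBuilder &builder,
                                      mlir::Location loc, mlir::Type innerTy,
                                      mlir::ValueRange lenParams, bool onStack) {
  if (onStack)
    // fir.alloca: no dealloc region needed; storage dies with the frame.
    return builder.createTemporary(loc, innerTy, /*name=*/{}, /*shape=*/{},
                                   lenParams);
  // fir.allocmem: the caller must also populate a dealloc region that
  // fir.freemem's this temp (what createCleanupRegion does above).
  return builder.createHeapTemporary(loc, innerTy, /*name=*/{}, /*shape=*/{},
                                     lenParams);
}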
Lines changed: 66 additions & 41 deletions
@@ -1,9 +1,22 @@
 ! Tests delayed privatization for `targets ... private(..)` for allocatables.

 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \
-! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN:   -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
 ! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging -o - %s 2>&1 \
-! RUN:   | FileCheck %s
+! RUN:   | FileCheck %s --check-prefix=CPU
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN:   %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir \
+! RUN:     -fopenmp -fopenmp-is-target-device \
+! RUN:     -mmlir --enable-delayed-privatization-staging \
+! RUN:     -o - %s 2>&1 | \
+! RUN:   FileCheck %s --check-prefix=GPU \
+! RUN: %}
+
+! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization-staging \
+! RUN:   -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=GPU

 subroutine target_allocatable
   implicit none
@@ -14,53 +27,65 @@ subroutine target_allocatable
   !$omp end target
 end subroutine target_allocatable

-! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] :
-! CHECK-SAME: [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
-! CHECK: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
+! CPU-LABEL: omp.private {type = private}
+! CPU-SAME: @[[VAR_PRIVATIZER_SYM:.*]] :
+! CPU-SAME: [[DESC_TYPE:!fir.box<!fir.heap<i32>>]] init {
+! CPU: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]], %[[PRIV_ALLOC:.*]]: [[TYPE]]):
+
+! CPU-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
+! CPU-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
+! CPU-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
+! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : i64
+! CPU-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64

-! CHECK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : [[TYPE]]
-! CHECK-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : ([[DESC_TYPE]]) -> !fir.heap<i32>
-! CHECK-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
-! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi eq, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+! CPU-NEXT: fir.if %[[ALLOC_COND]] {
+! CPU-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
+! CPU-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
+! CPU-NEXT: } else {
+! CPU-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
+! CPU-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
+! CPU-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
+! CPU-NEXT: }

-! CHECK-NEXT: fir.if %[[ALLOC_COND]] {
-! CHECK-NEXT: %[[ZERO_BOX:.*]] = fir.embox %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
-! CHECK-NEXT: fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
-! CHECK-NEXT: } else {
-! CHECK-NEXT: %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32
-! CHECK-NEXT: %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> [[DESC_TYPE]]
-! CHECK-NEXT: fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : [[TYPE]]
-! CHECK-NEXT: }
+! CPU-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])

-! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : [[TYPE]])
+! CPU-NEXT: } dealloc {
+! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):

-! CHECK-NEXT: } dealloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! CPU-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
+! CPU-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
+! CPU-NEXT: %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
+! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : i64
+! CPU-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64

-! CHECK-NEXT: %[[PRIV_VAL:.*]] = fir.load %[[PRIV_ARG]]
-! CHECK-NEXT: %[[PRIV_ADDR:.*]] = fir.box_addr %[[PRIV_VAL]]
-! CHECK-NEXT: %[[PRIV_ADDR_I64:.*]] = fir.convert %[[PRIV_ADDR]]
-! CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : i64
-! CHECK-NEXT: %[[PRIV_NULL_COND:.*]] = arith.cmpi ne, %[[PRIV_ADDR_I64]], %[[C0]] : i64
+! CPU-NEXT: fir.if %[[PRIV_NULL_COND]] {
+! CPU-NEXT: fir.freemem %[[PRIV_ADDR]]
+! CPU-NEXT: }

-! CHECK-NEXT: fir.if %[[PRIV_NULL_COND]] {
-! CHECK-NEXT: fir.freemem %[[PRIV_ADDR]]
-! CHECK-NEXT: }
+! CPU-NEXT: omp.yield
+! CPU-NEXT: }

-! CHECK-NEXT: omp.yield
-! CHECK-NEXT: }

+! CPU-LABEL: func.func @_QPtarget_allocatable() {

-! CHECK-LABEL: func.func @_QPtarget_allocatable() {
+! CPU: %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
+! CPU-SAME: {bindc_name = "alloc_var", {{.*}}}
+! CPU: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
+! CPU: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
+! CPU: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
+! CPU: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>

-! CHECK: %[[VAR_ALLOC:.*]] = fir.alloca [[DESC_TYPE]]
-! CHECK-SAME: {bindc_name = "alloc_var", {{.*}}}
-! CHECK: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_ALLOC]]
-! CHECK: %[[BASE_ADDR:.*]] = fir.box_offset %[[VAR_DECL]]#0 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> [[MEMBER_TYPE:.*]]
-! CHECK: %[[MEMBER:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BASE_ADDR]] : [[MEMBER_TYPE:.*]]) -> {{.*}}
-! CHECK: %[[MAP_VAR:.*]] = omp.map.info var_ptr(%[[VAR_DECL]]#0 : [[TYPE]], [[DESC_TYPE]]) map_clauses(to) capture(ByRef) members(%[[MEMBER]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.heap<i32>>>
+! CPU: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
+! CPU-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {

-! CHECK: omp.target map_entries(%[[MAP_VAR]] -> %arg0, %[[MEMBER]] -> %arg1 : [[TYPE]], [[MEMBER_TYPE]]) private(
-! CHECK-SAME: @[[VAR_PRIVATIZER_SYM]] %[[VAR_DECL]]#0 -> %{{.*}} [map_idx=0] : [[TYPE]]) {
+! GPU-LABEL: omp.private {type = private} {{.*}} init {
+! GPU: fir.if %{{.*}} {
+! GPU-NEXT: %[[ZERO_BOX:.*]] = fir.embox %{{.*}}
+! GPU-NEXT: fir.store %[[ZERO_BOX]] to %{{.*}}
+! GPU-NEXT: } else {
+! GPU-NOT: fir.allocmem i32
+! GPU-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32
+! GPU-NEXT: %[[PRIV_ALLOC_BOX:.*]] = fir.embox %[[PRIV_ALLOC]]
+! GPU-NEXT: fir.store %[[PRIV_ALLOC_BOX]] to %{{.*}}
+! GPU-NEXT: }
+! GPU-NEXT: omp.yield(%{{.*}})
