Skip to content

Commit cda1425

Browse files
committed
[flang][OpenMP] Allocate reduction init temps on the stack for GPUs (llvm#146667)
Temps needed for the reduction init regions are currently allocated on the heap all the time. However, this is a performance killer for GPUs since malloc calls are prohibitively expensive. Therefore, we should do these allocations on the stack for GPU reductions.
1 parent 5d49f24 commit cda1425

File tree

3 files changed

+135
-104
lines changed

3 files changed

+135
-104
lines changed

flang/lib/Lower/Support/PrivateReductionUtils.cpp

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -503,31 +503,48 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
503503

504504
// TODO: Allocate on the heap if the whole reduction/privatization is nested
505505
// inside of a loop
506-
mlir::Value tempValue;
507-
std::optional<int64_t> cstNeedsDealloc;
508-
if (isAllocatableOrPointer) {
509-
auto [heapTemp, needsDealloc] = createTempFromMold(loc, builder, source);
510-
tempValue = heapTemp;
511-
cstNeedsDealloc = fir::getIntIfConstant(needsDealloc);
512-
} else {
513-
tempValue = hlfir::createStackTempFromMold(loc, builder, source);
514-
cstNeedsDealloc = false;
515-
}
516-
hlfir::Entity temp{tempValue};
517-
518-
// if needsDealloc isn't statically false, add cleanup region. Always
519-
// do this for allocatable boxes because they might have been re-allocated
520-
// in the body of the loop/parallel region
521-
assert(cstNeedsDealloc.has_value() &&
522-
"createTempFromMold decides this statically");
523-
if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
524-
mlir::OpBuilder::InsertionGuard guard(builder);
525-
createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
526-
isDoConcurrent);
527-
} else {
528-
assert(!isAllocatableOrPointer &&
529-
"Pointer-like arrays must be heap allocated");
530-
}
506+
auto temp = [&]() {
507+
bool shouldAllocateOnStack = false;
508+
509+
// On the GPU, always allocate on the stack since heap allocations are very
510+
// expensive.
511+
if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
512+
*builder.getModule()))
513+
shouldAllocateOnStack = offloadMod.getIsGPU();
514+
515+
if (shouldAllocateOnStack)
516+
return createStackTempFromMold(loc, builder, source);
517+
518+
mlir::Value tempValue;
519+
std::optional<int64_t> cstNeedsDealloc;
520+
if (isAllocatableOrPointer) {
521+
auto [heapTemp, needsDealloc] = createTempFromMold(loc, builder, source);
522+
tempValue = heapTemp;
523+
cstNeedsDealloc = fir::getIntIfConstant(needsDealloc);
524+
} else {
525+
tempValue = hlfir::createStackTempFromMold(loc, builder, source);
526+
cstNeedsDealloc = false;
527+
}
528+
hlfir::Entity temp{tempValue};
529+
530+
// if needsDealloc isn't statically false, add cleanup region. Always
531+
// do this for allocatable boxes because they might have been re-allocated
532+
// in the body of the loop/parallel region
533+
534+
std::optional<int64_t> cstNeedsDealloc =
535+
fir::getIntIfConstant(needsDealloc);
536+
assert(cstNeedsDealloc.has_value() &&
537+
"createTempFromMold decides this statically");
538+
if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
539+
mlir::OpBuilder::InsertionGuard guard(builder);
540+
createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
541+
isDoConcurrent);
542+
} else {
543+
assert(!isAllocatableOrPointer &&
544+
"Pointer-like arrays must be heap allocated");
545+
}
546+
return temp;
547+
}();
531548

532549
// Put the temporary inside of a box:
533550
// hlfir::genVariableBox doesn't handle non-default lower bounds
Lines changed: 88 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
2-
! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
3-
! XFAIL: *
1+
! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
2+
! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
3+
4+
! RUN: bbc -emit-hlfir -fopenmp -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 | FileCheck %s --check-prefix=GPU
5+
! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir -fopenmp -fopenmp-is-target-device -o - %s 2>&1 | FileCheck %s --check-prefix=GPU
46

57
program reduce
68
integer, dimension(3) :: i = 0
@@ -14,81 +16,88 @@ program reduce
1416
print *,i
1517
end program
1618

17-
! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
18-
! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
19-
! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
20-
! CHECK-LABEL: } init {
21-
! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[ALLOC:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
22-
! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
23-
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
24-
! CHECK: %[[VAL_4:.*]] = arith.constant 3 : index
25-
! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
26-
! CHECK: %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> {bindc_name = ".tmp", uniq_name = ""}
27-
! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<3xi32>>,
28-
! CHECK: %[[TRUE:.*]] = arith.constant true
19+
! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
20+
! CPU: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
21+
! CPU: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
22+
! CPU-LABEL: } init {
23+
! CPU: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[ALLOC:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
24+
! CPU: %[[VAL_2:.*]] = arith.constant 0 : i32
25+
! CPU: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
26+
! CPU: %[[VAL_4:.*]] = arith.constant 3 : index
27+
! CPU: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
28+
! CPU: %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> {bindc_name = ".tmp", uniq_name = ""}
29+
! CPU: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<3xi32>>,
30+
! CPU: %[[TRUE:.*]] = arith.constant true
2931
!fir.shape<1>) -> (!fir.heap<!fir.array<3xi32>>, !fir.heap<!fir.array<3xi32>>)
30-
! CHECK: %[[C0:.*]] = arith.constant 0 : index
31-
! CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[VAL_3]], %[[C0]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
32-
! CHECK: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
33-
! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap<!fir.array<3xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<3xi32>>
34-
! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
35-
! CHECK: fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
36-
! CHECK: omp.yield(%[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
37-
! CHECK: } combiner {
38-
! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
39-
! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
40-
! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
41-
! CHECK: %[[C1:.*]] = arith.constant 1 : index
42-
! CHECK: %[[C3:.*]] = arith.constant 3 : index
43-
! CHECK: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[C1]], %[[C3]] : (index, index) -> !fir.shapeshift<1>
44-
! CHECK: %[[C1_0:.*]] = arith.constant 1 : index
45-
! CHECK: fir.do_loop %[[VAL_8:.*]] = %[[C1_0]] to %[[C3]] step %[[C1_0]] unordered {
46-
! CHECK: %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
47-
! CHECK: %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
48-
! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
49-
! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
50-
! CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
51-
! CHECK: fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
52-
! CHECK: }
53-
! CHECK: omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
54-
! CHECK: } cleanup {
55-
! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
56-
! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
57-
! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<3xi32>>) -> !fir.ref<!fir.array<3xi32>>
58-
! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> i64
59-
! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64
60-
! CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64
61-
! CHECK: fir.if %[[VAL_5]] {
62-
! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> !fir.heap<!fir.array<3xi32>>
63-
! CHECK: fir.freemem %[[VAL_6]] : !fir.heap<!fir.array<3xi32>>
64-
! CHECK: }
65-
! CHECK: omp.yield
66-
! CHECK: }
32+
! CPU: %[[C0:.*]] = arith.constant 0 : index
33+
! CPU: %[[DIMS:.*]]:3 = fir.box_dims %[[VAL_3]], %[[C0]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
34+
! CPU: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
35+
! CPU: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap<!fir.array<3xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<3xi32>>
36+
! CPU: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
37+
! CPU: fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
38+
! CPU: omp.yield(%[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
39+
! CPU: } combiner {
40+
! CPU: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
41+
! CPU: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
42+
! CPU: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
43+
! CPU: %[[C1:.*]] = arith.constant 1 : index
44+
! CPU: %[[C3:.*]] = arith.constant 3 : index
45+
! CPU: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[C1]], %[[C3]] : (index, index) -> !fir.shapeshift<1>
46+
! CPU: %[[C1_0:.*]] = arith.constant 1 : index
47+
! CPU: fir.do_loop %[[VAL_8:.*]] = %[[C1_0]] to %[[C3]] step %[[C1_0]] unordered {
48+
! CPU: %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
49+
! CPU: %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
50+
! CPU: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
51+
! CPU: %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
52+
! CPU: %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
53+
! CPU: fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
54+
! CPU: }
55+
! CPU: omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
56+
! CPU: } cleanup {
57+
! CPU: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
58+
! CPU: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
59+
! CPU: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<3xi32>>) -> !fir.ref<!fir.array<3xi32>>
60+
! CPU: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> i64
61+
! CPU: %[[VAL_4:.*]] = arith.constant 0 : i64
62+
! CPU: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64
63+
! CPU: fir.if %[[VAL_5]] {
64+
! CPU: %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> !fir.heap<!fir.array<3xi32>>
65+
! CPU: fir.freemem %[[VAL_6]] : !fir.heap<!fir.array<3xi32>>
66+
! CPU: }
67+
! CPU: omp.yield
68+
! CPU: }
69+
70+
! CPU-LABEL: func.func @_QQmain()
71+
! CPU: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
72+
! CPU: %[[VAL_1:.*]] = arith.constant 3 : index
73+
! CPU: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
74+
! CPU: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
75+
! CPU: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
76+
! CPU: %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
77+
! CPU: fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
78+
! CPU: omp.parallel reduction(byref @add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
79+
! CPU: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
80+
! CPU: %[[VAL_8:.*]] = arith.constant 1 : i32
81+
! CPU: %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
82+
! CPU: %[[VAL_10:.*]] = arith.constant 1 : index
83+
! CPU: %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]] (%[[VAL_10]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
84+
! CPU: hlfir.assign %[[VAL_8]] to %[[VAL_11]] : i32, !fir.ref<i32>
85+
! CPU: %[[VAL_12:.*]] = arith.constant 2 : i32
86+
! CPU: %[[VAL_13:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
87+
! CPU: %[[VAL_14:.*]] = arith.constant 2 : index
88+
! CPU: %[[VAL_15:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_14]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
89+
! CPU: hlfir.assign %[[VAL_12]] to %[[VAL_15]] : i32, !fir.ref<i32>
90+
! CPU: %[[VAL_16:.*]] = arith.constant 3 : i32
91+
! CPU: %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
92+
! CPU: %[[VAL_18:.*]] = arith.constant 3 : index
93+
! CPU: %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
94+
! CPU: hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
95+
! CPU: omp.terminator
96+
! CPU: }
6797

68-
! CHECK-LABEL: func.func @_QQmain()
69-
! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
70-
! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index
71-
! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
72-
! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
73-
! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
74-
! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
75-
! CHECK: fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
76-
! CHECK: omp.parallel reduction(byref @add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
77-
! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
78-
! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
79-
! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
80-
! CHECK: %[[VAL_10:.*]] = arith.constant 1 : index
81-
! CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]] (%[[VAL_10]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
82-
! CHECK: hlfir.assign %[[VAL_8]] to %[[VAL_11]] : i32, !fir.ref<i32>
83-
! CHECK: %[[VAL_12:.*]] = arith.constant 2 : i32
84-
! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
85-
! CHECK: %[[VAL_14:.*]] = arith.constant 2 : index
86-
! CHECK: %[[VAL_15:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_14]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
87-
! CHECK: hlfir.assign %[[VAL_12]] to %[[VAL_15]] : i32, !fir.ref<i32>
88-
! CHECK: %[[VAL_16:.*]] = arith.constant 3 : i32
89-
! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
90-
! CHECK: %[[VAL_18:.*]] = arith.constant 3 : index
91-
! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
92-
! CHECK: hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
93-
! CHECK: omp.terminator
94-
! CHECK: }
98+
! GPU: omp.declare_reduction {{.*}} alloc {
99+
! GPU: } init {
100+
! GPU-NOT: fir.allocmem {{.*}} {bindc_name = ".tmp", {{.*}}}
101+
! GPU: fir.alloca {{.*}} {bindc_name = ".tmp"}
102+
! GPU: } combiner {
103+
! GPU: }

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,6 +1132,11 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
11321132
mapInitializationArgs(op, moduleTranslation, reductionDecls,
11331133
reductionVariableMap, i);
11341134

1135+
// TODO In some cases (especially on the GPU), the init regions may
1136+
// contain stack allocations. If the region is inlined in a loop, this is
1137+
// problematic. Instead of just inlining the region, handle allocations by
1138+
// hoisting fixed length allocations to the function entry and using
1139+
// stacksave and restore for variable length ones.
11351140
if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
11361141
"omp.reduction.neutral", builder,
11371142
moduleTranslation, &phis)))

0 commit comments

Comments
 (0)