diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index e878041d37c03..c3a5b6101ce00 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -502,22 +502,37 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
 
   // Allocating on the heap in case the whole reduction/privatization is nested
   // inside of a loop
-  auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
-  // if needsDealloc isn't statically false, add cleanup region. Always
-  // do this for allocatable boxes because they might have been re-allocated
-  // in the body of the loop/parallel region
-
-  std::optional<int64_t> cstNeedsDealloc = fir::getIntIfConstant(needsDealloc);
-  assert(cstNeedsDealloc.has_value() &&
-         "createTempFromMold decides this statically");
-  if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
-    mlir::OpBuilder::InsertionGuard guard(builder);
-    createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
-                        isDoConcurrent);
-  } else {
-    assert(!isAllocatableOrPointer &&
-           "Pointer-like arrays must be heap allocated");
-  }
+  auto temp = [&]() {
+    bool shouldAllocateOnStack = false;
+
+    // On the GPU, always allocate on the stack since heap allocations are
+    // very expensive.
+    if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
+            *builder.getModule()))
+      shouldAllocateOnStack = offloadMod.getIsGPU();
+
+    if (shouldAllocateOnStack)
+      return createStackTempFromMold(loc, builder, source);
+
+    auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
+    // if needsDealloc isn't statically false, add cleanup region. Always
+    // do this for allocatable boxes because they might have been re-allocated
+    // in the body of the loop/parallel region
+
+    std::optional<int64_t> cstNeedsDealloc =
+        fir::getIntIfConstant(needsDealloc);
+    assert(cstNeedsDealloc.has_value() &&
+           "createTempFromMold decides this statically");
+    if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
+                          isDoConcurrent);
+    } else {
+      assert(!isAllocatableOrPointer &&
+             "Pointer-like arrays must be heap allocated");
+    }
+    return temp;
+  }();
 
   // Put the temporary inside of a box:
   // hlfir::genVariableBox doesn't handle non-default lower bounds
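Condensed, the new logic above asks the module for its OpenMP offload interface and, on GPU targets, takes the stack path; a stack temporary also needs no cleanup region, so the heap-only `needsDealloc` bookkeeping is skipped entirely. Below is a standalone sketch of just that predicate. The helper name and header paths are ours for illustration; the `dyn_cast` to `mlir::omp::OffloadModuleInterface` and the `getIsGPU()` query are exactly the calls the patch uses.

```cpp
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"

// Illustrative helper (not part of the patch): decide whether the
// reduction/privatization init temporary should live on the stack.
// Heap allocation inside a GPU kernel is extremely expensive, so any
// module whose offload interface reports a GPU gets a fir.alloca
// temporary instead of a fir.allocmem one.
static bool shouldAllocateReductionTempOnStack(fir::FirOpBuilder &builder) {
  if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
          *builder.getModule()))
    return offloadMod.getIsGPU();
  // Host compilation: keep the heap path and its conditional cleanup region.
  return false;
}
```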
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
index 8e3de498f59c1..4f889d9a4e77f 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
@@ -1,5 +1,8 @@
-! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-is-target-device -fopenmp-is-gpu -o - %s 2>&1 | FileCheck %s --check-prefix=GPU
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir -fopenmp -fopenmp-is-target-device -o - %s 2>&1 | FileCheck %s --check-prefix=GPU
 
 program reduce
 integer, dimension(3) :: i = 0
@@ -13,81 +16,88 @@ program reduce
 print *,i
 end program
 
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
-! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
-! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK-LABEL: } init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[ALLOC:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
-! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK: %[[VAL_4:.*]] = arith.constant 3 : index
-! CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> {bindc_name = ".tmp", uniq_name = ""}
-! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<3xi32>>,
-! CHECK: %[[TRUE:.*]] = arith.constant true
+! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
+! CPU: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CPU: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU-LABEL: } init {
+! CPU: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[ALLOC:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CPU: %[[VAL_2:.*]] = arith.constant 0 : i32
+! CPU: %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU: %[[VAL_4:.*]] = arith.constant 3 : index
+! CPU: %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
+! CPU: %[[VAL_1:.*]] = fir.allocmem !fir.array<3xi32> {bindc_name = ".tmp", uniq_name = ""}
+! CPU: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<3xi32>>,
+! CPU: %[[TRUE:.*]] = arith.constant true
 !fir.shape<1>) -> (!fir.heap<!fir.array<3xi32>>, !fir.heap<!fir.array<3xi32>>)
-! CHECK: %[[C0:.*]] = arith.constant 0 : index
-! CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[VAL_3]], %[[C0]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
-! CHECK: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
-! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap<!fir.array<3xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<3xi32>>
-! CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
-! CHECK: fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK: omp.yield(%[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK: } combiner {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
-! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK: %[[C1:.*]] = arith.constant 1 : index
-! CHECK: %[[C3:.*]] = arith.constant 3 : index
-! CHECK: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[C1]], %[[C3]] : (index, index) -> !fir.shapeshift<1>
-! CHECK: %[[C1_0:.*]] = arith.constant 1 : index
-! CHECK: fir.do_loop %[[VAL_8:.*]] = %[[C1_0]] to %[[C3]] step %[[C1_0]] unordered {
-! CHECK: %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
-! CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
-! CHECK: fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
-! CHECK: }
-! CHECK: omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK: } cleanup {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
-! CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<3xi32>>) -> !fir.ref<!fir.array<3xi32>>
-! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> i64
-! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64
-! CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64
-! CHECK: fir.if %[[VAL_5]] {
-! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> !fir.heap<!fir.array<3xi32>>
-! CHECK: fir.freemem %[[VAL_6]] : !fir.heap<!fir.array<3xi32>>
-! CHECK: }
-! CHECK: omp.yield
-! CHECK: }
+! CPU: %[[C0:.*]] = arith.constant 0 : index
+! CPU: %[[DIMS:.*]]:3 = fir.box_dims %[[VAL_3]], %[[C0]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
+! CPU: %[[SHIFT:.*]] = fir.shape_shift %[[DIMS]]#0, %[[DIMS]]#1 : (index, index) -> !fir.shapeshift<1>
+! CPU: %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[SHIFT]]) : (!fir.heap<!fir.array<3xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<3xi32>>
+! CPU: hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
+! CPU: fir.store %[[VAL_7]] to %[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU: omp.yield(%[[ALLOC]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU: } combiner {
+! CPU: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CPU: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU: %[[C1:.*]] = arith.constant 1 : index
+! CPU: %[[C3:.*]] = arith.constant 3 : index
+! CPU: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[C1]], %[[C3]] : (index, index) -> !fir.shapeshift<1>
+! CPU: %[[C1_0:.*]] = arith.constant 1 : index
+! CPU: fir.do_loop %[[VAL_8:.*]] = %[[C1_0]] to %[[C3]] step %[[C1_0]] unordered {
+! CPU: %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CPU: %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[SHAPE_SHIFT]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CPU: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CPU: %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+! CPU: %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
+! CPU: fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
+! CPU: }
+! CPU: omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU: } cleanup {
+! CPU: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CPU: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU: %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.array<3xi32>>) -> !fir.ref<!fir.array<3xi32>>
+! CPU: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> i64
+! CPU: %[[VAL_4:.*]] = arith.constant 0 : i64
+! CPU: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_3]], %[[VAL_4]] : i64
+! CPU: fir.if %[[VAL_5]] {
+! CPU: %[[VAL_6:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.array<3xi32>>) -> !fir.heap<!fir.array<3xi32>>
+! CPU: fir.freemem %[[VAL_6]] : !fir.heap<!fir.array<3xi32>>
+! CPU: }
+! CPU: omp.yield
+! CPU: }
+
+! CPU-LABEL: func.func @_QQmain()
+! CPU: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
+! CPU: %[[VAL_1:.*]] = arith.constant 3 : index
+! CPU: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+! CPU: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CPU: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! CPU: %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CPU: fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU: omp.parallel reduction(byref @add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
+! CPU: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CPU: %[[VAL_8:.*]] = arith.constant 1 : i32
+! CPU: %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU: %[[VAL_10:.*]] = arith.constant 1 : index
+! CPU: %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]] (%[[VAL_10]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CPU: hlfir.assign %[[VAL_8]] to %[[VAL_11]] : i32, !fir.ref<i32>
+! CPU: %[[VAL_12:.*]] = arith.constant 2 : i32
+! CPU: %[[VAL_13:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU: %[[VAL_14:.*]] = arith.constant 2 : index
+! CPU: %[[VAL_15:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_14]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CPU: hlfir.assign %[[VAL_12]] to %[[VAL_15]] : i32, !fir.ref<i32>
+! CPU: %[[VAL_16:.*]] = arith.constant 3 : i32
+! CPU: %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CPU: %[[VAL_18:.*]] = arith.constant 3 : index
+! CPU: %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CPU: hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
+! CPU: omp.terminator
+! CPU: }
 
-! CHECK-LABEL: func.func @_QQmain()
-! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
-! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index
-! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
-! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
-! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK: omp.parallel reduction(byref @add_reduction_byref_box_3xi32 %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
-! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : index
-! CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]] (%[[VAL_10]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK: hlfir.assign %[[VAL_8]] to %[[VAL_11]] : i32, !fir.ref<i32>
-! CHECK: %[[VAL_12:.*]] = arith.constant 2 : i32
-! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK: %[[VAL_14:.*]] = arith.constant 2 : index
-! CHECK: %[[VAL_15:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_14]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK: hlfir.assign %[[VAL_12]] to %[[VAL_15]] : i32, !fir.ref<i32>
-! CHECK: %[[VAL_16:.*]] = arith.constant 3 : i32
-! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
-! CHECK: %[[VAL_18:.*]] = arith.constant 3 : index
-! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]]) : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
-! CHECK: hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
-! CHECK: omp.terminator
-! CHECK: }
+! GPU: omp.declare_reduction {{.*}} alloc {
+! GPU: } init {
+! GPU-NOT: fir.allocmem {{.*}} {bindc_name = ".tmp", {{.*}}}
+! GPU: fir.alloca {{.*}} {bindc_name = ".tmp"}
+! GPU: } combiner {
+! GPU: }
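The GPU expectations boil down to one observable difference: the init region's `.tmp` buffer comes from `fir.alloca` rather than `fir.allocmem`, so there is no `fir.freemem` cleanup to check for. In `fir::FirOpBuilder` terms the fork looks roughly like the sketch below; the patch itself goes through `createStackTempFromMold`/`createTempFromMold`, and `createInitTemp` is a hypothetical name used only for illustration.

```cpp
#include "flang/Optimizer/Builder/FIRBuilder.h"

// Hypothetical condensation of the allocation fork the CPU/GPU check
// prefixes distinguish. FirOpBuilder::createTemporary emits a fir.alloca;
// createHeapTemporary emits a fir.allocmem (helper signatures assumed
// from FirOpBuilder's temporary-creation API).
static mlir::Value createInitTemp(fir::FirOpBuilder &builder,
                                  mlir::Location loc, mlir::Type tempTy,
                                  bool onStack) {
  if (onStack)
    return builder.createTemporary(loc, tempTy, ".tmp");   // GPU: fir.alloca
  return builder.createHeapTemporary(loc, tempTy, ".tmp"); // CPU: fir.allocmem
}
```

Note that only the `bbc` GPU RUN line passes `-fopenmp-is-gpu` explicitly; the `%flang_fc1` line presumably relies on the `amdgcn-amd-amdhsa` triple to mark the offload module as a GPU target.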
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 883d179580e0c..ed88c19ab2c25 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1291,6 +1291,11 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
     mapInitializationArgs(op, moduleTranslation, reductionDecls,
                           reductionVariableMap, i);
 
+    // TODO: In some cases (especially on the GPU), the init regions may
+    // contain stack allocations. If the region is inlined into a loop, this
+    // is problematic. Instead of just inlining the region, handle
+    // allocations by hoisting fixed-length allocations to the function entry
+    // and using stacksave/stackrestore for variable-length ones.
     if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
                                        "omp.reduction.neutral", builder,
                                        moduleTranslation, &phis)))
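The TODO is worth unpacking: `inlineConvertOmpRegions` splices the init region straight into the surrounding code stream, so if that point is inside a loop, any alloca in the region allocates fresh stack space on every iteration. One way the suggested fix could look at the LLVM IR level, as a sketch under our own assumptions rather than code from this patch: hoist constant-sized allocas to the function entry block and bracket variable-sized ones with the stack save/restore intrinsics.

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

// Sketch only: one possible realization of the TODO above. Constant-sized
// allocas are hoisted into the entry block so the frame slot is created
// once per function invocation; dynamically sized allocas get a stacksave
// before and a stackrestore after, so each loop iteration returns its
// stack memory. Placing the restore correctly (after the last use of the
// temporary) is the hard part and is simplified here.
static void fixLoopAlloca(llvm::AllocaInst *alloca,
                          llvm::IRBuilderBase &builder) {
  llvm::Function *func = alloca->getFunction();
  if (llvm::isa<llvm::ConstantInt>(alloca->getArraySize())) {
    // Fixed length: move ahead of the entry block's terminator, making
    // this a static alloca that is materialized once.
    alloca->moveBefore(func->getEntryBlock().getTerminator());
    return;
  }
  // Variable length: save the stack pointer just before the alloca...
  builder.SetInsertPoint(alloca);
  llvm::Value *saved = builder.CreateStackSave();
  // ...and restore it at the end of the alloca's block (simplification:
  // this assumes the temporary is dead by then).
  builder.SetInsertPoint(alloca->getParent()->getTerminator());
  builder.CreateStackRestore(saved);
}
```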