diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td
index e2f092024c250..6b5e38aa4faed 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.td
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -128,4 +128,21 @@ def AutomapToTargetDataPass
   let dependentDialects = ["mlir::omp::OpenMPDialect"];
 }
 
+def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::func::FuncOp"> {
+  let summary = "Replaces stack allocations with shared memory.";
+  let description = [{
+    `fir.alloca` operations defining values in a target region and then used
+    inside of an `omp.parallel` region are replaced by this pass with
+    `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is also done for
+    top-level function `fir.alloca`s used in the same way when the parent
+    function is a target device function.
+
+    This ensures that explicit private allocations, intended to be shared across
+    threads, use the proper memory space on a target device while supporting the
+    case of parallel regions indirectly reached from within a target region via
+    function calls.
+  }];
+  let dependentDialects = ["mlir::omp::OpenMPDialect"];
+}
+
 #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index b85ee7e861a4f..1c224e0785f96 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -11,6 +11,7 @@ add_flang_library(FlangOpenMPTransforms
   LowerWorkshare.cpp
   LowerNontemporal.cpp
   SimdOnly.cpp
+  StackToShared.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/OpenMP/StackToShared.cpp b/flang/lib/Optimizer/OpenMP/StackToShared.cpp
new file mode 100644
index 0000000000000..e666e2ed8f9b9
--- /dev/null
+++ b/flang/lib/Optimizer/OpenMP/StackToShared.cpp
@@ -0,0 +1,177 @@
+//===- StackToShared.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements transforms to swap stack allocations on the target
+// device with device shared memory where applicable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h"
+
+namespace flangomp {
+#define GEN_PASS_DEF_STACKTOSHAREDPASS
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+using namespace mlir;
+
+namespace {
+class StackToSharedPass
+    : public flangomp::impl::StackToSharedPassBase<StackToSharedPass> {
+public:
+  StackToSharedPass() = default;
+
+  // Replaces every eligible fir.alloca in the function with a pair of
+  // omp.alloc_shared_mem and omp.free_shared_mem operations. Nothing is done
+  // unless the parent module is being compiled for a target device.
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    OpBuilder builder(context);
+
+    func::FuncOp funcOp = getOperation();
+    auto offloadIface = funcOp->getParentOfType<omp::OffloadModuleInterface>();
+    if (!offloadIface || !offloadIface.getIsTargetDevice())
+      return;
+
+    // Only operations are created and erased below, never blocks, so a single
+    // dominance analysis can be shared by all replaced allocations.
+    DominanceInfo domInfo;
+    funcOp->walk([&](fir::AllocaOp allocaOp) {
+      if (!shouldReplaceAlloca(*allocaOp))
+        return;
+
+      // Replace fir.alloca with omp.alloc_shared_mem.
+      builder.setInsertionPoint(allocaOp);
+      auto sharedAllocOp = omp::AllocSharedMemOp::create(
+          builder, allocaOp->getLoc(), allocaOp.getResult().getType(),
+          allocaOp.getInType(), allocaOp.getUniqNameAttr(),
+          allocaOp.getBindcNameAttr(), allocaOp.getTypeparams(),
+          allocaOp.getShape());
+      allocaOp.replaceAllUsesWith(sharedAllocOp.getOperation());
+      allocaOp.erase();
+
+      // Create a new omp.free_shared_mem for the allocated buffer prior to
+      // exiting the region, i.e. before the terminator of every block without
+      // successors that is dominated by the block holding the allocation.
+      Block *allocaBlock = sharedAllocOp->getBlock();
+      for (Block &block : sharedAllocOp->getParentRegion()->getBlocks()) {
+        Operation *terminator = block.getTerminator();
+        if (!terminator->hasSuccessors() &&
+            domInfo.dominates(allocaBlock, &block)) {
+          builder.setInsertionPoint(terminator);
+          omp::FreeSharedMemOp::create(builder, sharedAllocOp.getLoc(),
+                                       sharedAllocOp);
+        }
+      }
+    });
+  }
+
+private:
+  // TODO: Refactor the logic in `shouldReplaceAlloca` and `checkAllocaUses` to
+  // be reusable by the MLIR to LLVM IR translation stage, as something very
+  // similar is also implemented there to choose between allocas and device
+  // shared memory allocations when processing OpenMP reductions, mapping and
+  // privatization.
+
+  // Check whether an operation is nested inside of an omp.parallel region that
+  // belongs to the same device code. `targetOp` is the omp.target enclosing
+  // `op`, or null if there is none. If the closest omp.parallel is not itself
+  // nested in `targetOp`, it wraps the whole target region from the host and
+  // must be ignored.
+  static bool isInDeviceParallel(Operation &op, omp::TargetOp targetOp) {
+    auto parallelOp = op.getParentOfType<omp::ParallelOp>();
+    return parallelOp && (!targetOp || targetOp->isProperAncestor(parallelOp));
+  }
+
+  // Decide whether to replace a fir.alloca with a pair of device shared memory
+  // allocation/deallocation pair based on the location of the allocation and
+  // its uses.
+  //
+  // In summary, it should be done whenever the allocation is placed outside any
+  // parallel regions and inside either a target device function or a generic
+  // kernel, while being used inside of a parallel region.
+  bool shouldReplaceAlloca(Operation &op) {
+    auto targetOp = op.getParentOfType<omp::TargetOp>();
+
+    // It must be inside of a generic omp.target or in a target device function,
+    // and not inside of omp.parallel.
+    if (isInDeviceParallel(op, targetOp))
+      return false;
+
+    if (targetOp) {
+      if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) !=
+          mlir::omp::TargetExecMode::generic)
+        return false;
+    } else {
+      auto declTargetIface = dyn_cast<omp::DeclareTargetInterface>(
+          *op.getParentOfType<func::FuncOp>());
+      if (!declTargetIface || !declTargetIface.isDeclareTarget() ||
+          declTargetIface.getDeclareTargetDeviceType() ==
+              mlir::omp::DeclareTargetDeviceType::host)
+        return false;
+    }
+
+    return checkAllocaUses(op.getUses());
+  }
+
+  // When a use takes place inside an omp.parallel region and it's not as a
+  // private clause argument, or when it is a reduction argument passed to
+  // omp.parallel, then the defining allocation is eligible for replacement with
+  // shared memory. Parallel regions wrapping the enclosing omp.target from the
+  // host are not taken into account.
+  //
+  // Only one of the uses needs to meet these conditions to return true.
+  bool checkAllocaUses(const Operation::use_range &uses) {
+    auto checkUse = [&](const OpOperand &use) {
+      Operation *owner = use.getOwner();
+      if (auto parallelOp = dyn_cast<omp::ParallelOp>(owner))
+        return llvm::is_contained(parallelOp.getReductionVars(), use.get());
+
+      if (!isInDeviceParallel(*owner, owner->getParentOfType<omp::TargetOp>()))
+        return false;
+
+      // If it is used directly inside of a parallel region, it has to be
+      // replaced unless the use is a private clause.
+      if (auto argIface = dyn_cast<omp::BlockArgOpenMPOpInterface>(owner)) {
+        if (auto privateSyms = llvm::cast_or_null<ArrayAttr>(
+                owner->getAttr("private_syms"))) {
+          for (auto [var, sym] :
+               llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) {
+            if (var != use.get())
+              continue;
+
+            auto moduleOp = owner->getParentOfType<ModuleOp>();
+            auto privateOp = cast<omp::PrivateClauseOp>(
+                moduleOp.lookupSymbol(cast<SymbolRefAttr>(sym)));
+            return privateOp.getDataSharingType() !=
+                   omp::DataSharingClauseType::Private;
+          }
+        }
+      }
+      return true;
+    };
+
+    // Check direct uses and also follow hlfir.declare uses.
+    for (const OpOperand &use : uses) {
+      if (auto declareOp = dyn_cast<hlfir::DeclareOp>(use.getOwner())) {
+        if (checkAllocaUses(declareOp->getUses()))
+          return true;
+      } else if (checkUse(use)) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index a83b0665eaf1f..3bcf71c8d3eda 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -335,8 +335,10 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm,
   pm.addPass(flangomp::createMapInfoFinalizationPass());
   pm.addPass(flangomp::createMarkDeclareTargetPass());
   pm.addPass(flangomp::createGenericLoopConversionPass());
-  if (opts.isTargetDevice)
+  if (opts.isTargetDevice) {
+    pm.addPass(flangomp::createStackToSharedPass());
     pm.addPass(flangomp::createFunctionFilteringPass());
+  }
 }
 
 void createDebugPasses(mlir::PassManager &pm,
diff --git a/flang/test/Transforms/OpenMP/stack-to-shared.mlir b/flang/test/Transforms/OpenMP/stack-to-shared.mlir
new file mode 100644
index 0000000000000..a7842048a8411
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/stack-to-shared.mlir
@@ -0,0 +1,215 @@
+// RUN: fir-opt --split-input-file --omp-stack-to-shared %s | FileCheck %s
+
+module attributes {omp.is_target_device = true} {
+  omp.declare_reduction @add_reduction_i32 : i32 init {
+  ^bb0(%arg0: i32):
+    %c0_i32 = arith.constant 0 : i32
+    omp.yield(%c0_i32 : i32)
+  } combiner {
+  ^bb0(%arg0: i32, %arg1: i32):
+    %0 = arith.addi %arg0, %arg1 : i32
+    omp.yield(%0 : i32)
+  }
+
+  omp.private {type = private} @privatizer_i32 : i32
+  omp.private {type = firstprivate} @firstprivatizer_i32 : i32 copy {
+  ^bb0(%arg0: i32, %arg1: i32):
+    omp.yield(%arg0 : i32)
+  }
+
+  // Verify that target device functions are searched for allocas shared across
+  // threads of a parallel region.
+  //
+  // Also ensure that all fir.alloca information is adequately forwarded to the
+  // new allocation, that uses of the allocation through hlfir.declare are
+  // detected and that only the expected types of uses (parallel reduction and
+  // non-private uses inside of a parallel region) are replaced.
+  // CHECK-LABEL: func.func @standalone_func
+  func.func @standalone_func(%lb: i32, %ub: i32, %step: i32) attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>} {
+    // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref<i32>
+    %0 = fir.alloca i32 {uniq_name = "x"}
+    %c = arith.constant 1 : index
+    // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem !fir.char<1,?>(%[[C:.*]] : index), %[[C]] {bindc_name = "y", uniq_name = "y"} : !fir.ref<!fir.char<1,?>>
+    %1 = fir.alloca !fir.char<1,?>(%c : index), %c {bindc_name = "y", uniq_name = "y"}
+    // CHECK: %{{.*}}:2 = hlfir.declare %[[ALLOC_1]] typeparams %[[C]] {uniq_name = "y"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+    %decl:2 = hlfir.declare %1 typeparams %c {uniq_name = "y"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+    // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "z"}
+    %2 = fir.alloca i32 {uniq_name = "z"}
+    // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "a"} : !fir.ref<i32>
+    %3 = fir.alloca i32 {uniq_name = "a"}
+    // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "b"}
+    %4 = fir.alloca i32 {uniq_name = "b"}
+    omp.parallel reduction(@add_reduction_i32 %0 -> %arg0 : !fir.ref<i32>) {
+      // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "c"}
+      %5 = fir.alloca i32 {uniq_name = "c"}
+      %6:2 = fir.unboxchar %decl#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+      omp.wsloop private(@privatizer_i32 %2 -> %arg1, @firstprivatizer_i32 %3 -> %arg2 : !fir.ref<i32>, !fir.ref<i32>) {
+        omp.loop_nest (%arg3) : i32 = (%lb) to (%ub) inclusive step (%step) {
+          %7 = fir.load %5 : !fir.ref<i32>
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    %5 = fir.load %4 : !fir.ref<i32>
+    // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref<i32>
+    // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref<!fir.char<1,?>>
+    // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref<i32>
+    // CHECK-NEXT: return
+    return
+  }
+
+  // Verify that generic target regions are searched for allocas shared across
+  // threads of a parallel region.
+  // CHECK-LABEL: func.func @target_generic
+  func.func @target_generic() {
+    // CHECK: omp.target
+    omp.target {
+      %c = arith.constant 0 : i32
+      // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref<i32>
+      %0 = fir.alloca i32 {uniq_name = "x"}
+      // CHECK: omp.teams
+      omp.teams {
+        // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem i32 {uniq_name = "y"} : !fir.ref<i32>
+        %1 = fir.alloca i32 {uniq_name = "y"}
+        // CHECK: omp.distribute
+        omp.distribute {
+          omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) {
+            // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "z"} : !fir.ref<i32>
+            %2 = fir.alloca i32 {uniq_name = "z"}
+            // CHECK: omp.parallel
+            omp.parallel {
+              %3 = fir.load %0 : !fir.ref<i32>
+              %4 = fir.load %1 : !fir.ref<i32>
+              %5 = fir.load %2 : !fir.ref<i32>
+              // CHECK: omp.terminator
+              omp.terminator
+            }
+            // CHECK: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref<i32>
+            // CHECK: omp.yield
+            omp.yield
+          }
+        }
+        // CHECK: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref<i32>
+        // CHECK: omp.terminator
+        omp.terminator
+      }
+      // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref<i32>
+      // CHECK: omp.terminator
+      omp.terminator
+    }
+    // CHECK: return
+    return
+  }
+
+  // Make sure that uses not shared across threads on a parallel region inside
+  // of target are not incorrectly detected as such if there's another parallel
+  // region in the host wrapping the whole target region.
+  // CHECK-LABEL: func.func @target_generic_in_parallel
+  func.func @target_generic_in_parallel() {
+    // CHECK-NOT: omp.alloc_shared_mem
+    // CHECK-NOT: omp.free_shared_mem
+    omp.parallel {
+      omp.target {
+        %c = arith.constant 0 : i32
+        %0 = fir.alloca i32 {uniq_name = "x"}
+        omp.teams {
+          %1 = fir.alloca i32 {uniq_name = "y"}
+          omp.distribute {
+            omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) {
+              %3 = fir.load %0 : !fir.ref<i32>
+              %4 = fir.load %1 : !fir.ref<i32>
+              omp.parallel {
+                omp.terminator
+              }
+              omp.yield
+            }
+          }
+          omp.terminator
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    // CHECK: return
+    return
+  }
+
+  // Ensure that allocations within SPMD target regions are not replaced with
+  // device shared memory regardless of use.
+  // CHECK-LABEL: func.func @target_spmd
+  func.func @target_spmd() {
+    // CHECK-NOT: omp.alloc_shared_mem
+    // CHECK-NOT: omp.free_shared_mem
+    omp.target {
+      %c = arith.constant 0 : i32
+      %0 = fir.alloca i32 {uniq_name = "x"}
+      omp.teams {
+        %1 = fir.alloca i32 {uniq_name = "y"}
+        omp.parallel {
+          %2 = fir.alloca i32 {uniq_name = "z"}
+          %3 = fir.load %0 : !fir.ref<i32>
+          %4 = fir.load %1 : !fir.ref<i32>
+          omp.distribute {
+            omp.wsloop {
+              omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) {
+                %5 = fir.load %2 : !fir.ref<i32>
+                omp.yield
+              }
+            } {omp.composite}
+          } {omp.composite}
+          omp.terminator
+        } {omp.composite}
+        omp.terminator
+      }
+      omp.terminator
+    }
+    // CHECK: return
+    return
+  }
+}
+
+// -----
+
+// No transformations must be done when targeting the host device.
+// CHECK-LABEL: func.func @host_standalone
+func.func @host_standalone() {
+  // CHECK-NOT: omp.alloc_shared_mem
+  // CHECK-NOT: omp.free_shared_mem
+  %0 = fir.alloca i32 {uniq_name = "x"}
+  omp.parallel {
+    %1 = fir.load %0 : !fir.ref<i32>
+    omp.terminator
+  }
+  // CHECK: return
+  return
+}
+
+// CHECK-LABEL: func.func @host_target
+func.func @host_target() {
+  // CHECK-NOT: omp.alloc_shared_mem
+  // CHECK-NOT: omp.free_shared_mem
+  omp.target {
+    %c = arith.constant 0 : i32
+    %0 = fir.alloca i32 {uniq_name = "x"}
+    omp.teams {
+      %1 = fir.alloca i32 {uniq_name = "y"}
+      omp.distribute {
+        omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) {
+          %2 = fir.alloca i32 {uniq_name = "z"}
+          omp.parallel {
+            %3 = fir.load %0 : !fir.ref<i32>
+            %4 = fir.load %1 : !fir.ref<i32>
+            %5 = fir.load %2 : !fir.ref<i32>
+            omp.terminator
+          }
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  // CHECK: return
+  return
+}