diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h b/flang/include/flang/Optimizer/OpenMP/Passes.h
index 403d79667bf44..feb395f1a12db 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.h
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.h
@@ -25,6 +25,11 @@ namespace flangomp {
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/OpenMP/Passes.h.inc"
 
+/// Implements the logic specified in the 2.8.3 workshare Construct section of
+/// the OpenMP standard, which specifies what statements or constructs shall be
+/// divided into units of work.
+bool shouldUseWorkshareLowering(mlir::Operation *op);
+
 } // namespace flangomp
 
 #endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td
index c070bc22ff20c..37977334c1e9e 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.td
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -50,4 +50,9 @@ def FunctionFilteringPass : Pass<"omp-function-filtering"> {
   ];
 }
 
+// Needs to be scheduled on Module as we create functions in it
+def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> {
+  let summary = "Lower workshare construct";
+}
+
 #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index 3b54ac3883858..55fafc2e6b36f 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -123,7 +123,8 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
 void createHLFIRToFIRPassPipeline(
-    mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel);
+    mlir::PassManager &pm, bool enableOpenMP,
+    llvm::OptimizationLevel optLevel = defaultOptLevel);
 
 /// Create a pass pipeline for handling certain OpenMP transformations needed
 /// prior to FIR lowering.
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index df4b21ada058f..d936b739e5815 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -123,6 +123,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
       false; ///< Set no-signed-zeros-fp-math attribute for functions.
   bool UnsafeFPMath = false; ///< Set unsafe-fp-math attribute for functions.
   bool NSWOnLoopVarInc = false; ///< Add nsw flag to loop variable increments.
+  bool EnableOpenMP = false; ///< Enable OpenMP lowering.
 };
 
 struct OffloadModuleOpts {
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index f2e460fc53a67..8c21fe18e67b4 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -715,7 +715,11 @@ void CodeGenAction::lowerHLFIRToFIR() {
   pm.enableVerifier(/*verifyPasses=*/true);
 
   // Create the pass pipeline
-  fir::createHLFIRToFIRPassPipeline(pm, level);
+  fir::createHLFIRToFIRPassPipeline(
+      pm,
+      ci.getInvocation().getFrontendOpts().features.IsEnabled(
+          Fortran::common::LanguageFeature::OpenMP),
+      level);
   (void)mlir::applyPassManagerCLOptions(pm);
 
   if (!mlir::succeeded(pm.run(*mlirModule))) {
@@ -828,6 +832,10 @@ void CodeGenAction::generateLLVMIR() {
       config.VScaleMax = vsr->second;
   }
 
+  if (ci.getInvocation().getFrontendOpts().features.IsEnabled(
+          Fortran::common::LanguageFeature::OpenMP))
+    config.EnableOpenMP = true;
+
   if (ci.getInvocation().getLoweringOpts().getNSWOnLoopVarInc())
     config.NSWOnLoopVarInc = true;
 
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index 035d0d5ca46c7..b1e0dbf6e707e 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -5,6 +5,7 @@ add_flang_library(FlangOpenMPTransforms
   MapsForPrivatizedSymbols.cpp
   MapInfoFinalization.cpp
   MarkDeclareTarget.cpp
+  LowerWorkshare.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
new file mode 100644
index 0000000000000..225c585a02d91
--- /dev/null
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -0,0 +1,527 @@
+//===- LowerWorkshare.cpp - lower the omp.workshare construct ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering of omp.workshare to other omp constructs.
+//
+// This pass is tasked with parallelizing the loops nested in
+// workshare.loop_wrapper, while both the Fortran to MLIR lowering and the
+// HLFIR to FIR lowering pipelines are responsible for emitting the
+// workshare.loop_wrapper ops where appropriate according to the
+// `shouldUseWorkshareLowering` function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/BreadthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVectorExtras.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#include <variant>
+
+namespace flangomp {
+#define GEN_PASS_DEF_LOWERWORKSHARE
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+#define DEBUG_TYPE "lower-workshare"
+
+using namespace mlir;
+
+namespace flangomp {
+
+// Checks for the nesting pattern below, as we need to avoid sharing the work
+// of statements which are nested in some constructs such as omp.critical or
+// another omp.parallel.
+//
+// omp.workshare { // `wsOp`
+//   ...
+//   omp.T { // `parent`
+//     ...
+//     `op`
+//
+template <typename T>
+static bool isNestedIn(omp::WorkshareOp wsOp, Operation *op) {
+  T parent = op->getParentOfType<T>();
+  if (!parent)
+    return false;
+  return wsOp->isProperAncestor(parent);
+}
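+
+// For example, in the illustrative IR below (not taken from a test),
+// `shouldUseWorkshareLowering` would return true for the top-level operation
+// but false for the one nested in the critical construct:
+//
+//   omp.workshare {
+//     "test.shared_work"() : () -> ()   // divided into units of work
+//     omp.critical {
+//       "test.guarded"() : () -> ()     // not divided: nested in critical
+//       omp.terminator
+//     }
+//     omp.terminator
+//   }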
+
+bool shouldUseWorkshareLowering(Operation *op) {
+  auto parentWorkshare = op->getParentOfType<omp::WorkshareOp>();
+
+  if (!parentWorkshare)
+    return false;
+
+  if (isNestedIn<omp::CriticalOp>(parentWorkshare, op))
+    return false;
+
+  // 2.8.3 workshare Construct
+  // For a parallel construct, the construct is a unit of work with respect to
+  // the workshare construct. The statements contained in the parallel
+  // construct are executed by a new thread team.
+  if (isNestedIn<omp::ParallelOp>(parentWorkshare, op))
+    return false;
+
+  // 2.8.2 single Construct
+  // Binding The binding thread set for a single region is the current team. A
+  // single region binds to the innermost enclosing parallel region.
+  // Description Only one of the encountering threads will execute the
+  // structured block associated with the single construct.
+  if (isNestedIn<omp::SingleOp>(parentWorkshare, op))
+    return false;
+
+  // Do not use workshare lowering until we support CFG in omp.workshare
+  if (parentWorkshare.getRegion().getBlocks().size() != 1)
+    return false;
+
+  return true;
+}
+
+} // namespace flangomp
+
+namespace {
+
+struct SingleRegion {
+  Block::iterator begin, end;
+};
+
+static bool mustParallelizeOp(Operation *op) {
+  return op
+      ->walk([&](Operation *nested) {
+        // We need to be careful not to pick up workshare.loop_wrapper in
+        // nested omp.parallel{omp.workshare} regions, i.e. make sure that
+        // `nested` binds to the workshare region we are currently handling.
+        //
+        // For example:
+        //
+        // omp.parallel {
+        //   omp.workshare { // currently handling this
+        //     omp.parallel {
+        //       omp.workshare { // nested workshare
+        //         omp.workshare.loop_wrapper {}
+        //
+        // Therefore, we skip if we encounter a nested omp.workshare.
+        if (isa<omp::WorkshareOp>(nested))
+          return WalkResult::skip();
+        if (isa<omp::WorkshareLoopWrapperOp>(nested))
+          return WalkResult::interrupt();
+        return WalkResult::advance();
+      })
+      .wasInterrupted();
+}
+
+static bool isSafeToParallelize(Operation *op) {
+  return isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
+         isMemoryEffectFree(op);
+}
+
+/// Simple shallow copies suffice for our purposes in this pass, so we
+/// implement this simpler alternative to the full-fledged `createCopyFunc` in
+/// the frontend.
+static mlir::func::FuncOp createCopyFunc(mlir::Location loc,
+                                         mlir::Type varType,
+                                         fir::FirOpBuilder builder) {
+  mlir::ModuleOp module = builder.getModule();
+  auto rt = cast<fir::ReferenceType>(varType);
+  mlir::Type eleTy = rt.getEleTy();
+  std::string copyFuncName =
+      fir::getTypeAsString(eleTy, builder.getKindMap(), "_workshare_copy");
+
+  if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
+    return decl;
+  // create function
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  mlir::OpBuilder modBuilder(module.getBodyRegion());
+  llvm::SmallVector<mlir::Type> argsTy = {varType, varType};
+  auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {});
+  mlir::func::FuncOp funcOp =
+      modBuilder.create<mlir::func::FuncOp>(loc, copyFuncName, funcType);
+  funcOp.setVisibility(mlir::SymbolTable::Visibility::Private);
+  builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy,
+                      {loc, loc});
+  builder.setInsertionPointToStart(&funcOp.getRegion().back());
+
+  Value loaded = builder.create<fir::LoadOp>(loc, funcOp.getArgument(1));
+  builder.create<fir::StoreOp>(loc, loaded, funcOp.getArgument(0));
+
+  builder.create<mlir::func::ReturnOp>(loc);
+  return funcOp;
+}
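+
+// For example, for varType = !fir.ref<i32> the generated copy function is the
+// one checked for in the lower-workshare-alloca.mlir test below:
+//
+//   func.func private @_workshare_copy_i32(%arg0: !fir.ref<i32>,
+//                                          %arg1: !fir.ref<i32>) {
+//     %0 = fir.load %arg1 : !fir.ref<i32>
+//     fir.store %0 to %arg0 : !fir.ref<i32>
+//     return
+//   }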
+
+static bool isUserOutsideSR(Operation *user, Operation *parentOp,
+                            SingleRegion sr) {
+  while (user->getParentOp() != parentOp)
+    user = user->getParentOp();
+  return sr.begin->getBlock() != user->getBlock() ||
+         !(user->isBeforeInBlock(&*sr.end) && sr.begin->isBeforeInBlock(user));
+}
+
+static bool isTransitivelyUsedOutside(Value v, SingleRegion sr) {
+  Block *srBlock = sr.begin->getBlock();
+  Operation *parentOp = srBlock->getParentOp();
+
+  for (auto &use : v.getUses()) {
+    Operation *user = use.getOwner();
+    if (isUserOutsideSR(user, parentOp, sr))
+      return true;
+
+    // Now we know user is inside `sr`.
+
+    // Results of nested users cannot be used outside of `sr`.
+    if (user->getBlock() != srBlock)
+      continue;
+
+    // A non-safe-to-parallelize operation will be checked for uses outside
+    // separately.
+    if (!isSafeToParallelize(user))
+      continue;
+
+    // For safe-to-parallelize operations, we need to check if there is a
+    // transitive use of `v` through them.
+    for (auto res : user->getResults())
+      if (isTransitivelyUsedOutside(res, sr))
+        return true;
+  }
+  return false;
+}
+
+/// We clone pure operations in both the parallel and single blocks. This
+/// function cleans them up if they end up with no uses.
+static void cleanupBlock(Block *block) {
+  for (Operation &op : llvm::make_early_inc_range(
+           llvm::make_range(block->rbegin(), block->rend())))
+    if (isOpTriviallyDead(&op))
+      op.erase();
+}
+
+static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
+                              IRMapping &rootMapping, Location loc,
+                              mlir::DominanceInfo &di) {
+  OpBuilder rootBuilder(sourceRegion.getContext());
+  ModuleOp m = sourceRegion.getParentOfType<ModuleOp>();
+  OpBuilder copyFuncBuilder(m.getBodyRegion());
+  fir::FirOpBuilder firCopyFuncBuilder(copyFuncBuilder, m);
+
+  auto mapReloadedValue =
+      [&](Value v, OpBuilder allocaBuilder, OpBuilder singleBuilder,
+          OpBuilder parallelBuilder, IRMapping singleMapping) -> Value {
+    if (auto reloaded = rootMapping.lookupOrNull(v))
+      return nullptr;
+    Type ty = v.getType();
+    Value alloc = allocaBuilder.create<fir::AllocaOp>(loc, ty);
+    singleBuilder.create<fir::StoreOp>(loc, singleMapping.lookup(v), alloc);
+    Value reloaded = parallelBuilder.create<fir::LoadOp>(loc, ty, alloc);
+    rootMapping.map(v, reloaded);
+    return alloc;
+  };
+
+  auto moveToSingle =
+      [&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder,
+          OpBuilder parallelBuilder) -> std::pair<bool, SmallVector<Value>> {
+    IRMapping singleMapping = rootMapping;
+    SmallVector<Value> copyPrivate;
+    bool allParallelized = true;
+
+    for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
+      if (isSafeToParallelize(&op)) {
+        singleBuilder.clone(op, singleMapping);
+        if (llvm::all_of(op.getOperands(), [&](Value opr) {
+              // Either we have already remapped it
+              bool remapped = rootMapping.contains(opr);
+              // Or it is available because it dominates `sr`
+              bool dominates =
+                  di.properlyDominates(opr.getDefiningOp(), &*sr.begin);
+              return remapped || dominates;
+            })) {
+          // Safe-to-parallelize operations which have all operands available
+          // in the root parallel block can be executed there.
+          parallelBuilder.clone(op, rootMapping);
+        } else {
+          // If any operand was not available, it means that there was no
+          // transitive use of a non-safe-to-parallelize operation outside
+          // `sr`. This means that there should be no transitive uses outside
+          // `sr` of `op`.
+          assert(llvm::all_of(op.getResults(), [&](Value v) {
+            return !isTransitivelyUsedOutside(v, sr);
+          }));
+          allParallelized = false;
+        }
+      } else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
+        auto hoisted =
+            cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
+        rootMapping.map(&*alloca, &*hoisted);
+        rootMapping.map(alloca.getResult(), hoisted.getResult());
+        copyPrivate.push_back(hoisted);
+        allParallelized = false;
+      } else {
+        singleBuilder.clone(op, singleMapping);
+        // Prepare reloaded values for results of operations that cannot be
+        // safely parallelized and which are used after the region `sr`.
+        for (auto res : op.getResults()) {
+          if (isTransitivelyUsedOutside(res, sr)) {
+            auto alloc = mapReloadedValue(res, allocaBuilder, singleBuilder,
+                                          parallelBuilder, singleMapping);
+            if (alloc)
+              copyPrivate.push_back(alloc);
+          }
+        }
+        allParallelized = false;
+      }
+    }
+    singleBuilder.create<omp::TerminatorOp>(loc);
+    return {allParallelized, copyPrivate};
+  };
+
+  for (Block &block : sourceRegion) {
+    Block *targetBlock = rootBuilder.createBlock(
+        &targetRegion, {}, block.getArgumentTypes(),
+        llvm::map_to_vector(block.getArguments(),
+                            [](BlockArgument arg) { return arg.getLoc(); }));
+    rootMapping.map(&block, targetBlock);
+    rootMapping.map(block.getArguments(), targetBlock->getArguments());
+  }
+
+  auto handleOneBlock = [&](Block &block) {
+    Block &targetBlock = *rootMapping.lookup(&block);
+    rootBuilder.setInsertionPointToStart(&targetBlock);
+    Operation *terminator = block.getTerminator();
+    SmallVector<std::variant<SingleRegion, Operation *>> regions;
+
+    auto it = block.begin();
+    auto getOneRegion = [&]() {
+      if (&*it == terminator)
+        return false;
+      if (mustParallelizeOp(&*it)) {
+        regions.push_back(&*it);
+        it++;
+        return true;
+      }
+      SingleRegion sr;
+      sr.begin = it;
+      while (&*it != terminator && !mustParallelizeOp(&*it))
+        it++;
+      sr.end = it;
+      assert(sr.begin != sr.end);
+      regions.push_back(sr);
+      return true;
+    };
+    while (getOneRegion())
+      ;
+
+    for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
+      bool isLast = i + 1 == regions.size();
+      if (std::holds_alternative<SingleRegion>(opOrSingle)) {
+        OpBuilder singleBuilder(sourceRegion.getContext());
+        Block *singleBlock = new Block();
+        singleBuilder.setInsertionPointToStart(singleBlock);
+
+        OpBuilder allocaBuilder(sourceRegion.getContext());
+        Block *allocaBlock = new Block();
+        allocaBuilder.setInsertionPointToStart(allocaBlock);
+
+        OpBuilder parallelBuilder(sourceRegion.getContext());
+        Block *parallelBlock = new Block();
+        parallelBuilder.setInsertionPointToStart(parallelBlock);
+
+        auto [allParallelized, copyprivateVars] =
+            moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder,
+                         singleBuilder, parallelBuilder);
+        if (allParallelized) {
+          // The single region was not required as all operations were safe to
+          // parallelize
+          assert(copyprivateVars.empty());
+          assert(allocaBlock->empty());
+          delete singleBlock;
+        } else {
+          omp::SingleOperands singleOperands;
+          if (isLast)
+            singleOperands.nowait = rootBuilder.getUnitAttr();
+          singleOperands.copyprivateVars = copyprivateVars;
+          cleanupBlock(singleBlock);
+          for (auto var : singleOperands.copyprivateVars) {
+            mlir::func::FuncOp funcOp =
+                createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
+            singleOperands.copyprivateSyms.push_back(
+                SymbolRefAttr::get(funcOp));
+          }
+          omp::SingleOp singleOp =
+              rootBuilder.create<omp::SingleOp>(loc, singleOperands);
+          singleOp.getRegion().push_back(singleBlock);
+          targetRegion.front().getOperations().splice(
+              singleOp->getIterator(), allocaBlock->getOperations());
+        }
+        rootBuilder.getInsertionBlock()->getOperations().splice(
+            rootBuilder.getInsertionPoint(), parallelBlock->getOperations());
+        delete allocaBlock;
+        delete parallelBlock;
+      } else {
+        auto op = std::get<Operation *>(opOrSingle);
+        if (auto wslw = dyn_cast<omp::WorkshareLoopWrapperOp>(op)) {
+          omp::WsloopOperands wsloopOperands;
+          if (isLast)
+            wsloopOperands.nowait = rootBuilder.getUnitAttr();
+          auto wsloop =
+              rootBuilder.create<mlir::omp::WsloopOp>(loc, wsloopOperands);
+          auto clonedWslw = cast<omp::WorkshareLoopWrapperOp>(
+              rootBuilder.clone(*wslw, rootMapping));
+          wsloop.getRegion().takeBody(clonedWslw.getRegion());
+          clonedWslw->erase();
+        } else {
+          assert(mustParallelizeOp(op));
+          Operation *cloned =
+              rootBuilder.cloneWithoutRegions(*op, rootMapping);
+          for (auto [region, clonedRegion] :
+               llvm::zip(op->getRegions(), cloned->getRegions()))
+            parallelizeRegion(region, clonedRegion, rootMapping, loc, di);
+        }
+      }
+    }
+
+    rootBuilder.clone(*block.getTerminator(), rootMapping);
+  };
+
+  if (sourceRegion.hasOneBlock()) {
+    handleOneBlock(sourceRegion.front());
+  } else {
+    auto &domTree = di.getDomTree(&sourceRegion);
+    for (auto node : llvm::breadth_first(domTree.getRootNode())) {
+      handleOneBlock(*node->getBlock());
+    }
+  }
+
+  for (Block &targetBlock : targetRegion)
+    cleanupBlock(&targetBlock);
+}
+
+/// Lowers workshare to a sequence of single-thread regions and parallel loops
+///
+/// For example:
+///
+/// omp.workshare {
+///   %a = fir.allocmem
+///   omp.workshare.loop_wrapper {}
+///   fir.call Assign %b %a
+///   fir.freemem %a
+/// }
+///
+/// becomes
+///
+/// %tmp = fir.alloca
+/// omp.single copyprivate(%tmp) {
+///   %a = fir.allocmem
+///   fir.store %a %tmp
+/// }
+/// %a_reloaded = fir.load %tmp
+/// omp.workshare.loop_wrapper {}
+/// omp.single {
+///   fir.call Assign %b %a_reloaded
+///   fir.freemem %a_reloaded
+/// }
+///
+/// Note that we allocate temporary memory for values in omp.single regions
+/// which need to be accessed by all threads and broadcast them using single's
+/// copyprivate.
+LogicalResult lowerWorkshare(mlir::omp::WorkshareOp wsOp, DominanceInfo &di) {
+  Location loc = wsOp->getLoc();
+  IRMapping rootMapping;
+
+  OpBuilder rootBuilder(wsOp);
+
+  // FIXME Currently, we only support workshare constructs with structured
+  // control flow. The transformation itself supports CFG, however, once we
+  // transform the MLIR region in the omp.workshare, we need to inline that
+  // region in the parent block. We have no guarantees at this point of the
+  // pipeline that the parent op supports CFG (e.g. fir.if), thus this is not
+  // generally possible. The alternative is to put the lowered region in an
+  // operation akin to scf.execute_region, which will get lowered at the same
+  // time when fir ops get lowered to CFG. However, SCF is not registered in
+  // flang so we cannot use it. Remove this requirement once we have
+  // scf.execute_region or an alternative operation available.
+  if (wsOp.getRegion().getBlocks().size() == 1) {
+    // This operation is just a placeholder which will be erased later. We
+    // need it because our `parallelizeRegion` function works on regions and
+    // not blocks.
+    omp::WorkshareOp newOp =
+        rootBuilder.create<omp::WorkshareOp>(loc, omp::WorkshareOperands());
+    if (!wsOp.getNowait())
+      rootBuilder.create<omp::BarrierOp>(loc);
+
+    parallelizeRegion(wsOp.getRegion(), newOp.getRegion(), rootMapping, loc,
+                      di);
+
+    // Inline the contents of the placeholder workshare op into its parent
+    // block.
+    Block *theBlock = &newOp.getRegion().front();
+    Operation *term = theBlock->getTerminator();
+    Block *parentBlock = wsOp->getBlock();
+    parentBlock->getOperations().splice(newOp->getIterator(),
+                                        theBlock->getOperations());
+    assert(term->getNumOperands() == 0);
+    term->erase();
+    newOp->erase();
+    wsOp->erase();
+  } else {
+    // Otherwise just change the operation to an omp.single.
+
+    wsOp->emitWarning("omp workshare with unstructured control flow is "
+                      "currently unsupported and will be serialized.");
+
+    // `shouldUseWorkshareLowering` should have guaranteed that there are no
+    // omp.workshare_loop_wrapper's that bind to this omp.workshare.
+    assert(!wsOp
+                ->walk([&](Operation *op) {
+                  // Nested omp.workshare can have their own
+                  // omp.workshare_loop_wrapper's.
+                  if (isa<omp::WorkshareOp>(op))
+                    return WalkResult::skip();
+                  if (isa<omp::WorkshareLoopWrapperOp>(op))
+                    return WalkResult::interrupt();
+                  return WalkResult::advance();
+                })
+                .wasInterrupted());
+
+    omp::SingleOperands operands;
+    operands.nowait = wsOp.getNowaitAttr();
+    omp::SingleOp newOp = rootBuilder.create<omp::SingleOp>(loc, operands);
+
+    newOp.getRegion().getBlocks().splice(
+        newOp.getRegion().getBlocks().begin(), wsOp.getRegion().getBlocks());
+    wsOp->erase();
+  }
+  return success();
+}
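+
+// The pass below schedules `lowerWorkshare` on every omp.workshare in the
+// module. As the tests in this patch do, it can be exercised in isolation
+// with:
+//
+//   fir-opt --lower-workshare input.mlir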
+
+class LowerWorksharePass
+    : public flangomp::impl::LowerWorkshareBase<LowerWorksharePass> {
+public:
+  void runOnOperation() override {
+    mlir::DominanceInfo &di = getAnalysis<mlir::DominanceInfo>();
+    getOperation()->walk([&](mlir::omp::WorkshareOp wsOp) {
+      if (failed(lowerWorkshare(wsOp, di)))
+        signalPassFailure();
+    });
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index a914407991591..31af3531641dd 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -212,7 +212,7 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
 /// \param pm - MLIR pass manager that will hold the pipeline definition
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
-void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
+void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
                                   llvm::OptimizationLevel optLevel) {
   if (optLevel.isOptimizingForSpeed()) {
     addCanonicalizerPassWithoutRegionSimplification(pm);
@@ -230,6 +230,8 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
   pm.addPass(hlfir::createBufferizeHLFIR());
   pm.addPass(hlfir::createConvertHLFIRtoFIR());
+  if (enableOpenMP)
+    pm.addPass(flangomp::createLowerWorkshare());
 }
 
 /// Create a pass pipeline for handling certain OpenMP transformations needed
@@ -303,7 +305,7 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
 void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
                                   MLIRToLLVMPassPipelineConfig &config,
                                   llvm::StringRef inputFilename) {
-  fir::createHLFIRToFIRPassPipeline(pm, config.OptLevel);
+  fir::createHLFIRToFIRPassPipeline(pm, config.EnableOpenMP, config.OptLevel);
 
   // Add default optimizer pass pipeline.
   fir::createDefaultFIROptimizerPassPipeline(pm, config);
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index bca454c13ff9c..4b18acb7c2b43 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -47,6 +47,7 @@ func.func @_QQmain() {
 // PASSES-NEXT: LowerHLFIRIntrinsics
 // PASSES-NEXT: BufferizeHLFIR
 // PASSES-NEXT: ConvertHLFIRtoFIR
+// PASSES-NEXT: LowerWorkshare
 // PASSES-NEXT: CSE
 // PASSES-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 // PASSES-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir b/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
new file mode 100644
index 0000000000000..12b0558d06ed5
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
@@ -0,0 +1,53 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Checks that fir.alloca is hoisted out and copyprivate'd
+func.func @wsfunc() {
+  omp.workshare {
+    %c1 = arith.constant 1 : index
+    %c42 = arith.constant 42 : index
+    %c1_i32 = arith.constant 1 : i32
+    %alloc = fir.alloca i32
+    fir.store %c1_i32 to %alloc : !fir.ref<i32>
+    omp.workshare.loop_wrapper {
+      omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+        "test.test1"(%alloc) : (!fir.ref<i32>) -> ()
+        omp.yield
+      }
+    }
+    "test.test2"(%alloc) : (!fir.ref<i32>) -> ()
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func.func private @_workshare_copy_i32(
+// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32>,
+// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32>) {
+// CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+// CHECK: fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+
+// CHECK-LABEL: func.func @wsfunc() {
+// CHECK: %[[VAL_0:.*]] = fir.alloca i32
+// CHECK: omp.single copyprivate(%[[VAL_0]] -> @_workshare_copy_i32 : !fir.ref<i32>) {
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK: fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref<i32>
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 42 : index
+// CHECK: omp.wsloop {
+// CHECK: omp.loop_nest (%[[VAL_4:.*]]) : index = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_2]]) {
+// CHECK: "test.test1"(%[[VAL_0]]) : (!fir.ref<i32>) -> ()
+// CHECK: omp.yield
+// CHECK: }
+// CHECK: }
+// CHECK: omp.single nowait {
+// CHECK: "test.test2"(%[[VAL_0]]) : (!fir.ref<i32>) -> ()
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.barrier
+// CHECK: return
+// CHECK: }
+
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir b/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir
new file mode 100644
index 0000000000000..f1d0e8e229614
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir
@@ -0,0 +1,49 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Checks that the omp.workshare.loop_wrapper binds to the correct omp.workshare
+
+func.func @wsfunc() {
+  %c1 = arith.constant 1 : index
+  %c42 = arith.constant 42 : index
+  omp.parallel {
+    omp.workshare nowait {
+      omp.parallel {
+        omp.workshare nowait {
+          omp.workshare.loop_wrapper {
+            omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+              "test.test2"() : () -> ()
+              omp.yield
+            }
+          }
+          omp.terminator
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @wsfunc() {
+// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 42 : index
+// CHECK: omp.parallel {
+// CHECK: omp.single nowait {
+// CHECK: omp.parallel {
+// CHECK: omp.wsloop nowait {
+// CHECK: omp.loop_nest (%[[VAL_2:.*]]) : index = (%[[VAL_0]]) to (%[[VAL_1]]) inclusive step (%[[VAL_0]]) {
+// CHECK: "test.test2"() : () -> ()
+// CHECK: omp.yield
+// CHECK: }
+// CHECK: }
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: return
+// CHECK: }
+
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir b/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir
new file mode 100644
index 0000000000000..ca288917a3ac4
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir
@@ -0,0 +1,57 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we cleanup unused pure operations from the parallel and single
+// regions
+
+// CHECK-LABEL: func.func @wsfunc() {
+// CHECK: %[[VAL_0:.*]] = fir.alloca i32
+// CHECK: omp.parallel {
+// CHECK: omp.single {
+// CHECK: %[[VAL_1:.*]] = "test.test1"() : () -> i32
+// CHECK: %[[VAL_2:.*]] = arith.constant 2 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 3 : index
+// CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_2]], %[[VAL_3]] : index
+// CHECK: "test.test3"(%[[VAL_4]]) : (index) -> ()
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant 42 : index
+// CHECK: omp.wsloop nowait {
+// CHECK: omp.loop_nest (%[[VAL_7:.*]]) : index = (%[[VAL_5]]) to (%[[VAL_6]]) inclusive step (%[[VAL_5]]) {
+// CHECK: "test.test2"() : () -> ()
+// CHECK: omp.yield
+// CHECK: }
+// CHECK: }
+// CHECK: omp.barrier
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: return
+// CHECK: }
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+      %t1 = "test.test1"() : () -> i32
+
+      %c1 = arith.constant 1 : index
+      %c42 = arith.constant 42 : index
+
+      %c2 = arith.constant 2 : index
+      %c3 = arith.constant 3 : index
+      %add = arith.addi %c2, %c3 : index
+      "test.test3"(%add) : (index) -> ()
+
+      omp.workshare.loop_wrapper {
+        omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+          "test.test2"() : () -> ()
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir b/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir
new file mode 100644
index 0000000000000..d7a04e198ceed
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir
@@ -0,0 +1,73 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+
+// Check if we store the correct values
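+//
+// A value defined in the single region only needs a copyprivate temporary if
+// it is (transitively) used after the region: %t1, %t2, %t4 and %t5 below are
+// stored, while %t3 and %t6, whose uses all stay inside the single region,
+// are not.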
+
+func.func @wsfunc() {
+  omp.parallel {
+    // CHECK: fir.alloca
+    // CHECK: fir.alloca
+    // CHECK: fir.alloca
+    // CHECK: fir.alloca
+    // CHECK: fir.alloca
+    // CHECK-NOT: fir.alloca
+    omp.workshare {
+
+      %t1 = "test.test1"() : () -> i32
+      // CHECK: %[[T1:.*]] = "test.test1"
+      // CHECK: fir.store %[[T1]]
+      %t2 = "test.test2"() : () -> i32
+      // CHECK: %[[T2:.*]] = "test.test2"
+      // CHECK: fir.store %[[T2]]
+      %t3 = "test.test3"() : () -> i32
+      // CHECK: %[[T3:.*]] = "test.test3"
+      // CHECK-NOT: fir.store %[[T3]]
+      %t4 = "test.test4"() : () -> i32
+      // CHECK: %[[T4:.*]] = "test.test4"
+      // CHECK: fir.store %[[T4]]
+      %t5 = "test.test5"() : () -> i32
+      // CHECK: %[[T5:.*]] = "test.test5"
+      // CHECK: fir.store %[[T5]]
+      %t6 = "test.test6"() : () -> i32
+      // CHECK: %[[T6:.*]] = "test.test6"
+      // CHECK-NOT: fir.store %[[T6]]
+
+
+      "test.test1"(%t1) : (i32) -> ()
+      "test.test1"(%t2) : (i32) -> ()
+      "test.test1"(%t3) : (i32) -> ()
+
+      %true = arith.constant true
+      fir.if %true {
+        "test.test2"(%t3) : (i32) -> ()
+      }
+
+      %c1_i32 = arith.constant 1 : i32
+
+      %t5_pure_use = arith.addi %t5, %c1_i32 : i32
+
+      %t6_mem_effect_use = "test.test8"(%t6) : (i32) -> i32
+      // CHECK: %[[T6_USE:.*]] = "test.test8"
+      // CHECK: fir.store %[[T6_USE]]
+
+      %c42 = arith.constant 42 : index
+      %c1 = arith.constant 1 : index
+      omp.workshare.loop_wrapper {
+        omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+          "test.test10"(%t1) : (i32) -> ()
+          "test.test10"(%t5_pure_use) : (i32) -> ()
+          "test.test10"(%t6_mem_effect_use) : (i32) -> ()
+          omp.yield
+        }
+      }
+
+      "test.test10"(%t2) : (i32) -> ()
+      fir.if %true {
+        "test.test10"(%t4) : (i32) -> ()
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir b/flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir
new file mode 100644
index 0000000000000..31db8213b5f00
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-correct-parallelize.mlir
@@ -0,0 +1,25 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that the safe to parallelize `fir.declare` op will not be parallelized
+// due to its operand %alloc not being reloaded outside the omp.single.
+
+func.func @foo() {
+  %c0 = arith.constant 0 : index
+  omp.workshare {
+    %alloc = fir.allocmem !fir.array<?xi32>, %c0 {bindc_name = ".tmp.forall", uniq_name = ""}
+    %shape = fir.shape %c0 : (index) -> !fir.shape<1>
+    %declare = fir.declare %alloc(%shape) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xi32>>
+    fir.freemem %alloc : !fir.heap<!fir.array<?xi32>>
+    omp.terminator
+  }
+  return
+}
+
+// CHECK: omp.single nowait
+// CHECK: fir.allocmem
+// CHECK: fir.shape
+// CHECK: fir.declare
+// CHECK: fir.freemem
+// CHECK: omp.terminator
+// CHECK: }
+// CHECK: omp.barrier
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir b/flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir
new file mode 100644
index 0000000000000..1fd379a6e5eb4
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-no-single.mlir
@@ -0,0 +1,19 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we do not emit an omp.single for the constant operation
+
+func.func @foo() {
+  omp.workshare {
+    %c1 = arith.constant 1 : index
+    omp.workshare.loop_wrapper {
+      omp.loop_nest (%arg1) : index = (%c1) to (%c1) inclusive step (%c1) {
+        "test.test0"() : () -> ()
+        omp.yield
+      }
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-NOT: omp.single
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir b/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir
new file mode 100644
index 0000000000000..940662e0bdccc
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir
@@ -0,0 +1,23 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Check that we correctly handle nowait
+
+// CHECK-LABEL: func.func @nonowait
+func.func @nonowait(%arg0: !fir.ref<!fir.array<42xi32>>) {
+  // CHECK: omp.barrier
+  omp.workshare {
+    omp.terminator
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @nowait
+func.func @nowait(%arg0: !fir.ref<!fir.array<42xi32>>) {
+  // CHECK-NOT: omp.barrier
+  omp.workshare nowait {
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
new file mode 100644
index 0000000000000..83c49cd635d08
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
@@ -0,0 +1,26 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | FileCheck %s
+
+// CHECK: warning: omp workshare with unstructured control flow is currently unsupported and will be serialized.
+
+// CHECK: omp.parallel
+// CHECK-NEXT: omp.single
+
+// TODO Check that the definition of %r dominates its use post-transform
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+    ^bb1:
+      %c1 = arith.constant 1 : i32
+      cf.br ^bb3(%c1: i32)
+    ^bb2:
+      "test.test2"(%r) : (i32) -> ()
+      omp.terminator
+    ^bb3(%arg1: i32):
+      %r = "test.test2"(%arg1) : (i32) -> i32
+      cf.br ^bb2
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
new file mode 100644
index 0000000000000..a27cf88069401
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
@@ -0,0 +1,23 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | FileCheck %s
+
+// CHECK: warning: omp workshare with unstructured control flow is currently unsupported and will be serialized.
+
+// CHECK: omp.parallel
+// CHECK-NEXT: omp.single
+
+// TODO Check transforming a simple CFG
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+    ^bb1:
+      %c1 = arith.constant 1 : i32
+      cf.br ^bb3(%c1: i32)
+    ^bb3(%arg1: i32):
+      "test.test2"(%arg1) : (i32) -> ()
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index fe5e36f704c76..1c24979bbcdaf 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -452,7 +452,8 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
 
   if (emitFIR && useHLFIR) {
     // lower HLFIR to FIR
-    fir::createHLFIRToFIRPassPipeline(pm, llvm::OptimizationLevel::O2);
+    fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP,
+                                      llvm::OptimizationLevel::O2);
     if (mlir::failed(pm.run(mlirModule))) {
       llvm::errs() << "FATAL: lowering from HLFIR to FIR failed";
       return mlir::failure();
@@ -467,6 +468,8 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
 
     // Add O2 optimizer pass pipeline.
     MLIRToLLVMPassPipelineConfig config(llvm::OptimizationLevel::O2);
+    if (enableOpenMP)
+      config.EnableOpenMP = true;
     config.NSWOnLoopVarInc = setNSW;
     fir::registerDefaultInlinerPass(config);
     fir::createDefaultFIROptimizerPassPipeline(pm, config);
diff --git a/flang/tools/tco/tco.cpp b/flang/tools/tco/tco.cpp
index 5c373c4e85258..eaf4bae088454 100644
--- a/flang/tools/tco/tco.cpp
+++ b/flang/tools/tco/tco.cpp
@@ -139,6 +139,7 @@ compileFIR(const mlir::PassPipelineCLParser &passPipeline) {
     return mlir::failure();
   } else {
     MLIRToLLVMPassPipelineConfig config(llvm::OptimizationLevel::O2);
+    config.EnableOpenMP = true; // assume the input contains OpenMP
     config.AliasAnalysis = true; // enabled when optimizing for speed
     if (codeGenLLVM) {
       // Run only CodeGen passes.