[flang][OpenMP] do concurrent: support local on device
#157638
Conversation
@llvm/pr-subscribers-flang-fir-hlfir

Author: Kareem Ergawy (ergawy)

Changes: Extends support for mapping `do concurrent` on the device by adding support for `local` specifiers. The changes in this PR map the local variable to the `omp.target` op and use the mapped value as the `private` clause operand in the nested `omp.parallel` op. A condensed sketch of the resulting op structure follows the diff below.

Full diff: https://github.com/llvm/llvm-project/pull/157638.diff

3 Files Affected:
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index bc971e8fd6600..fc6eedc6ed4c6 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3894,6 +3894,18 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
return getReduceVars().size();
}
+ unsigned getInductionVarsStart() {
+ return 0;
+ }
+
+ unsigned getLocalOperandsStart() {
+ return getNumInductionVars();
+ }
+
+ unsigned getReduceOperandsStart() {
+ return getLocalOperandsStart() + getNumLocalOperands();
+ }
+
mlir::Block::BlockArgListType getInductionVars() {
return getBody()->getArguments().slice(0, getNumInductionVars());
}
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 4358d9d1c9b86..3d2f5896ff4fb 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -138,6 +138,9 @@ void collectLoopLiveIns(fir::DoConcurrentLoopOp loop,
liveIns.push_back(operand->get());
});
+
+ for (mlir::Value local : loop.getLocalVars())
+ liveIns.push_back(local);
}
/// Collects values that are local to a loop: "loop-local values". A loop-local
@@ -252,8 +255,7 @@ class DoConcurrentConversion
.getIsTargetDevice();
mlir::omp::TargetOperands targetClauseOps;
- genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
- loopNestClauseOps,
+ genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps,
isTargetDevice ? nullptr : &targetClauseOps);
LiveInShapeInfoMap liveInShapeInfoMap;
@@ -275,14 +277,13 @@ class DoConcurrentConversion
}
mlir::omp::ParallelOp parallelOp =
- genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
+ genParallelOp(rewriter, loop, ivInfos, mapper);
// Only set as composite when part of `distribute parallel do`.
parallelOp.setComposite(mapToDevice);
if (!mapToDevice)
- genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
- loopNestClauseOps);
+ genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps);
for (mlir::Value local : locals)
looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
@@ -291,10 +292,38 @@ class DoConcurrentConversion
if (mapToDevice)
genDistributeOp(doLoop.getLoc(), rewriter).setComposite(/*val=*/true);
- mlir::omp::LoopNestOp ompLoopNest =
+ auto [loopNestOp, wsLoopOp] =
genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
/*isComposite=*/mapToDevice);
+ // `local` region arguments are transferred/cloned from the `do concurrent`
+ // loop to the loopnest op when the region is cloned above. Instead, these
+ // region arguments should be on the workshare loop's region.
+ if (mapToDevice) {
+ for (auto [parallelArg, loopNestArg] : llvm::zip_equal(
+ parallelOp.getRegion().getArguments(),
+ loopNestOp.getRegion().getArguments().slice(
+ loop.getLocalOperandsStart(), loop.getNumLocalOperands())))
+ rewriter.replaceAllUsesWith(loopNestArg, parallelArg);
+
+ for (auto [wsloopArg, loopNestArg] : llvm::zip_equal(
+ wsLoopOp.getRegion().getArguments(),
+ loopNestOp.getRegion().getArguments().slice(
+ loop.getReduceOperandsStart(), loop.getNumReduceOperands())))
+ rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
+ } else {
+ for (auto [wsloopArg, loopNestArg] :
+ llvm::zip_equal(wsLoopOp.getRegion().getArguments(),
+ loopNestOp.getRegion().getArguments().drop_front(
+ loopNestClauseOps.loopLowerBounds.size())))
+ rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
+ }
+
+ for (unsigned i = 0;
+ i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
+ loopNestOp.getRegion().eraseArgument(
+ loopNestClauseOps.loopLowerBounds.size());
+
rewriter.setInsertionPoint(doLoop);
fir::FirOpBuilder builder(
rewriter,
@@ -315,7 +344,7 @@ class DoConcurrentConversion
// Mark `unordered` loops that are not perfectly nested to be skipped from
// the legality check of the `ConversionTarget` since we are not interested
// in mapping them to OpenMP.
- ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) {
+ loopNestOp->walk([&](fir::DoConcurrentOp doLoop) {
concurrentLoopsToSkip.insert(doLoop);
});
@@ -371,11 +400,21 @@ class DoConcurrentConversion
llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>;
mlir::omp::ParallelOp
- genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+ genParallelOp(mlir::ConversionPatternRewriter &rewriter,
+ fir::DoConcurrentLoopOp loop,
looputils::InductionVariableInfos &ivInfos,
mlir::IRMapping &mapper) const {
- auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc);
- rewriter.createBlock(&parallelOp.getRegion());
+ mlir::omp::ParallelOperands parallelOps;
+
+ if (mapToDevice)
+ genPrivatizers(rewriter, mapper, loop, parallelOps);
+
+ mlir::Location loc = loop.getLoc();
+ auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc, parallelOps);
+ Fortran::common::openmp::EntryBlockArgs parallelArgs;
+ parallelArgs.priv.vars = parallelOps.privateVars;
+ Fortran::common::openmp::genEntryBlock(rewriter, parallelArgs,
+ parallelOp.getRegion());
rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));
genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
@@ -412,7 +451,7 @@ class DoConcurrentConversion
void genLoopNestClauseOps(
mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
- fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
+ fir::DoConcurrentLoopOp loop,
mlir::omp::LoopNestOperands &loopNestClauseOps,
mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
assert(loopNestClauseOps.loopLowerBounds.empty() &&
@@ -443,59 +482,14 @@ class DoConcurrentConversion
loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
}
- mlir::omp::LoopNestOp
+ std::pair<mlir::omp::LoopNestOp, mlir::omp::WsloopOp>
genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
const mlir::omp::LoopNestOperands &clauseOps,
bool isComposite) const {
mlir::omp::WsloopOperands wsloopClauseOps;
-
- auto cloneFIRRegionToOMP = [&rewriter](mlir::Region &firRegion,
- mlir::Region &ompRegion) {
- if (!firRegion.empty()) {
- rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());
- auto firYield =
- mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator());
- rewriter.setInsertionPoint(firYield);
- mlir::omp::YieldOp::create(rewriter, firYield.getLoc(),
- firYield.getOperands());
- rewriter.eraseOp(firYield);
- }
- };
-
- // For `local` (and `local_init`) opernads, emit corresponding `private`
- // clauses and attach these clauses to the workshare loop.
- if (!loop.getLocalVars().empty())
- for (auto [op, sym, arg] : llvm::zip_equal(
- loop.getLocalVars(),
- loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
- loop.getRegionLocalArgs())) {
- auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
- sym.getLeafReference());
- if (localizer.getLocalitySpecifierType() ==
- fir::LocalitySpecifierType::LocalInit)
- TODO(localizer.getLoc(),
- "local_init conversion is not supported yet");
-
- mlir::OpBuilder::InsertionGuard guard(rewriter);
- rewriter.setInsertionPointAfter(localizer);
-
- auto privatizer = mlir::omp::PrivateClauseOp::create(
- rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
- localizer.getTypeAttr().getValue(),
- mlir::omp::DataSharingClauseType::Private);
-
- cloneFIRRegionToOMP(localizer.getInitRegion(),
- privatizer.getInitRegion());
- cloneFIRRegionToOMP(localizer.getDeallocRegion(),
- privatizer.getDeallocRegion());
-
- moduleSymbolTable.insert(privatizer);
-
- wsloopClauseOps.privateVars.push_back(op);
- wsloopClauseOps.privateSyms.push_back(
- mlir::SymbolRefAttr::get(privatizer));
- }
+ if (!mapToDevice)
+ genPrivatizers(rewriter, mapper, loop, wsloopClauseOps);
if (!loop.getReduceVars().empty()) {
for (auto [op, byRef, sym, arg] : llvm::zip_equal(
@@ -518,15 +512,15 @@ class DoConcurrentConversion
rewriter, firReducer.getLoc(), ompReducerName,
firReducer.getTypeAttr().getValue());
- cloneFIRRegionToOMP(firReducer.getAllocRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
ompReducer.getAllocRegion());
- cloneFIRRegionToOMP(firReducer.getInitializerRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getInitializerRegion(),
ompReducer.getInitializerRegion());
- cloneFIRRegionToOMP(firReducer.getReductionRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getReductionRegion(),
ompReducer.getReductionRegion());
- cloneFIRRegionToOMP(firReducer.getAtomicReductionRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getAtomicReductionRegion(),
ompReducer.getAtomicReductionRegion());
- cloneFIRRegionToOMP(firReducer.getCleanupRegion(),
+ cloneFIRRegionToOMP(rewriter, firReducer.getCleanupRegion(),
ompReducer.getCleanupRegion());
moduleSymbolTable.insert(ompReducer);
}
@@ -558,21 +552,10 @@ class DoConcurrentConversion
rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
mlir::omp::YieldOp::create(rewriter, loop->getLoc());
+ loop->getParentOfType<mlir::ModuleOp>().print(
+ llvm::errs(), mlir::OpPrintingFlags().assumeVerified());
- // `local` region arguments are transferred/cloned from the `do concurrent`
- // loop to the loopnest op when the region is cloned above. Instead, these
- // region arguments should be on the workshare loop's region.
- for (auto [wsloopArg, loopNestArg] :
- llvm::zip_equal(wsloopOp.getRegion().getArguments(),
- loopNestOp.getRegion().getArguments().drop_front(
- clauseOps.loopLowerBounds.size())))
- rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
-
- for (unsigned i = 0;
- i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
- loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size());
-
- return loopNestOp;
+ return {loopNestOp, wsloopOp};
}
void genBoundsOps(fir::FirOpBuilder &builder, mlir::Value liveIn,
@@ -813,6 +796,59 @@ class DoConcurrentConversion
return distOp;
}
+ void cloneFIRRegionToOMP(mlir::ConversionPatternRewriter &rewriter,
+ mlir::Region &firRegion,
+ mlir::Region &ompRegion) const {
+ if (!firRegion.empty()) {
+ rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());
+ auto firYield =
+ mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator());
+ rewriter.setInsertionPoint(firYield);
+ mlir::omp::YieldOp::create(rewriter, firYield.getLoc(),
+ firYield.getOperands());
+ rewriter.eraseOp(firYield);
+ }
+ }
+
+ void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
+ mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
+ mlir::omp::PrivateClauseOps &privateClauseOps) const {
+ // For `local` (and `local_init`) operands, emit corresponding `private`
+ // clauses and attach these clauses to the workshare loop.
+ if (!loop.getLocalVars().empty())
+ for (auto [var, sym, arg] : llvm::zip_equal(
+ loop.getLocalVars(),
+ loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
+ loop.getRegionLocalArgs())) {
+ auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
+ sym.getLeafReference());
+ if (localizer.getLocalitySpecifierType() ==
+ fir::LocalitySpecifierType::LocalInit)
+ TODO(localizer.getLoc(),
+ "local_init conversion is not supported yet");
+
+ mlir::OpBuilder::InsertionGuard guard(rewriter);
+ rewriter.setInsertionPointAfter(localizer);
+
+ auto privatizer = mlir::omp::PrivateClauseOp::create(
+ rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
+ localizer.getTypeAttr().getValue(),
+ mlir::omp::DataSharingClauseType::Private);
+
+ cloneFIRRegionToOMP(rewriter, localizer.getInitRegion(),
+ privatizer.getInitRegion());
+ cloneFIRRegionToOMP(rewriter, localizer.getDeallocRegion(),
+ privatizer.getDeallocRegion());
+
+ moduleSymbolTable.insert(privatizer);
+
+ privateClauseOps.privateVars.push_back(mapToDevice ? mapper.lookup(var)
+ : var);
+ privateClauseOps.privateSyms.push_back(
+ mlir::SymbolRefAttr::get(privatizer));
+ }
+ }
+
bool mapToDevice;
llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
mlir::SymbolTable &moduleSymbolTable;
diff --git a/flang/test/Transforms/DoConcurrent/local_device.mlir b/flang/test/Transforms/DoConcurrent/local_device.mlir
new file mode 100644
index 0000000000000..e54bb1aeb414e
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/local_device.mlir
@@ -0,0 +1,49 @@
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" %s -o - | FileCheck %s
+
+fir.local {type = local} @_QFfooEmy_local_private_f32 : f32
+
+func.func @_QPfoo() {
+ %0 = fir.dummy_scope : !fir.dscope
+ %3 = fir.alloca f32 {bindc_name = "my_local", uniq_name = "_QFfooEmy_local"}
+ %4:2 = hlfir.declare %3 {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+
+ fir.do_concurrent {
+ %7 = fir.alloca i32 {bindc_name = "i"}
+ %8:2 = hlfir.declare %7 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+ fir.do_concurrent.loop (%arg0) = (%c1) to (%c10) step (%c1) local(@_QFfooEmy_local_private_f32 %4#0 -> %arg1 : !fir.ref<f32>) {
+ %9 = fir.convert %arg0 : (index) -> i32
+ fir.store %9 to %8#0 : !fir.ref<i32>
+ %10:2 = hlfir.declare %arg1 {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+ %cst = arith.constant 4.200000e+01 : f32
+ hlfir.assign %cst to %10#0 : f32, !fir.ref<f32>
+ }
+ }
+ return
+}
+
+// CHECK: omp.private {type = private} @[[OMP_PRIVATIZER:.*.omp]] : f32
+
+// CHECK: %[[LOCAL_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}my_local"}
+// CHECK: %[[LOCAL_MAP:.*]] = omp.map.info var_ptr(%[[LOCAL_DECL]]#1 : {{.*}})
+
+// CHECK: omp.target host_eval({{.*}}) map_entries({{.*}}, %[[LOCAL_MAP]] -> %[[LOCAL_MAP_ARG:.*]] : {{.*}}) {
+// CHECK: %[[LOCAL_DEV_DECL:.*]]:2 = hlfir.declare %[[LOCAL_MAP_ARG]] {uniq_name = "_QFfooEmy_local"}
+
+// CHECK: omp.teams {
+// CHECK: omp.parallel private(@[[OMP_PRIVATIZER]] %[[LOCAL_DEV_DECL]]#0 -> %[[LOCAL_PRIV_ARG:.*]] : {{.*}}) {
+// CHECK: omp.distribute {
+// CHECK: omp.wsloop {
+// CHECK: omp.loop_nest {{.*}} {
+// CHECK: %[[LOCAL_LOOP_DECL:.*]]:2 = hlfir.declare %[[LOCAL_PRIV_ARG]] {uniq_name = "_QFfooEmy_local"}
+// CHECK: hlfir.assign %{{.*}} to %[[LOCAL_LOOP_DECL]]#0
+// CHECK: omp.yield
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: }
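For readers skimming the diff, the device-side op structure produced for the new test looks roughly like the following. This is a condensed, illustrative sketch distilled from the CHECK lines of the lit test above; SSA value names are placeholders and clause details (bounds, other map entries) are elided, so it is not verbatim pass output.

omp.private {type = private} @_QFfooEmy_local_private_f32.omp : f32

omp.target host_eval(...) map_entries(..., %local_map -> %local_arg : !fir.ref<f32>) {
  %local_dev_decl:2 = hlfir.declare %local_arg {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
  omp.teams {
    omp.parallel private(@_QFfooEmy_local_private_f32.omp %local_dev_decl#0 -> %local_priv_arg : !fir.ref<f32>) {
      omp.distribute {
        omp.wsloop {
          omp.loop_nest (%iv) : index = (%lb) to (%ub) inclusive step (%step) {
            %local_loop_decl:2 = hlfir.declare %local_priv_arg {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
            hlfir.assign %cst to %local_loop_decl#0 : f32, !fir.ref<f32>
            omp.yield
          }
        }
      }
    }
    omp.terminator
  }
  omp.terminator
}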
…#155987) Upstreams further parts of `do concurrent` to OpenMP conversion pass from AMD's fork. This PR extends the pass by adding support for mapping to the device. PR stack: - llvm/llvm-project#155754 - llvm/llvm-project#155987◀️ - llvm/llvm-project#155992 - llvm/llvm-project#155993 - llvm/llvm-project#157638 - llvm/llvm-project#156610 - llvm/llvm-project#156837
… tests (#155992) Adds more lit tests for `do concurrent` device mapping. PR stack: - llvm/llvm-project#155754 - llvm/llvm-project#155987 - llvm/llvm-project#155992◀️ - llvm/llvm-project#155993 - llvm/llvm-project#157638 - llvm/llvm-project#156610 - llvm/llvm-project#156837
…nMP mapping (#155993) Adds end-to-end tests for `do concurrent` offloading to the device. PR stack: - llvm/llvm-project#155754 - llvm/llvm-project#155987 - llvm/llvm-project#155992 - llvm/llvm-project#155993◀️ - llvm/llvm-project#157638 - llvm/llvm-project#156610 - llvm/llvm-project#156837
bhandarkar-pranav
left a comment
Thanks for the PR @ergawy. I have one minor nit which you can roll in with any changes that other reviewers might require.
void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
Can you please add an overarching comment about what the function is doing including any conditions/prereqs for calling this function.
Done. Let me know if it could be expanded more.
Thank you, looks good. Just one (Ultra) nit - Is FIR privatizer the right term or should it be FIR localizer?
Extends support for mapping `do concurrent` on the device by adding support for `local` specifiers. The changes in this PR map the local variable to the `omp.target` op and use the mapped value as the `private` clause operand in the nested `omp.parallel` op.
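To make the wiring concrete, here is a minimal before/after sketch of the `local` handling on the device path (illustrative only; symbol and SSA names are placeholders, and unrelated clauses are elided):

// Before: the FIR loop carries the locality spec directly on the host value.
fir.do_concurrent.loop (%iv) = (%lb) to (%ub) step (%c1)
    local(@_QFfooEmy_local_private_f32 %host_decl#0 -> %local_arg : !fir.ref<f32>) {
  ...
}

// After: the variable is first mapped onto omp.target; the mapped (device-side)
// value, not the host value, then feeds the private clause on the nested
// omp.parallel, whose block argument is what the loop body uses.
omp.target map_entries(%local_map -> %mapped_arg : !fir.ref<f32>, ...) {
  %mapped_decl:2 = hlfir.declare %mapped_arg {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
  omp.parallel private(@_QFfooEmy_local_private_f32.omp %mapped_decl#0 -> %priv_arg : !fir.ref<f32>) {
    ...
  }
  omp.terminator
}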
mjklemm
left a comment
LGTM
… (#157638) Extends support for mapping `do concurrent` on the device by adding support for `local` specifiers. The changes in this PR map the local variable to the `omp.target` op and use the mapped value as the `private` clause operand in the nested `omp.parallel` op. - llvm/llvm-project#155754 - llvm/llvm-project#155987 - llvm/llvm-project#155992 - llvm/llvm-project#155993 - llvm/llvm-project#157638◀️ - llvm/llvm-project#156610 - llvm/llvm-project#156837
…e (#156610) Extends `do concurrent` to OpenMP device mapping by adding support for mapping `reduce` specifiers to omp `reduction` clauses. The changes attach 2 `reduction` clauses to the mapped OpenMP construct: one on the `teams` part of the construct and one on the `wloop` part. - llvm/llvm-project#155754 - llvm/llvm-project#155987 - llvm/llvm-project#155992 - llvm/llvm-project#155993 - llvm/llvm-project#157638 - llvm/llvm-project#156610◀️ - llvm/llvm-project#156837
…ions on the GPU (#156837) Fixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators. PR stack: - llvm/llvm-project#155754 - llvm/llvm-project#155987 - llvm/llvm-project#155992 - llvm/llvm-project#155993 - llvm/llvm-project#157638 - llvm/llvm-project#156610 - llvm/llvm-project#156837◀️
Extends support for mapping `do concurrent` on the device by adding support for `local` specifiers. The changes in this PR map the local variable to the `omp.target` op and use the mapped value as the `private` clause operand in the nested `omp.parallel` op.

PR stack:
- `do concurrent` mapping to device #155987
- `do concurrent` to device mapping lit tests #155992
- `do concurrent`: support `local` on device #157638
- `do concurrent`: support `reduce` on device #156610