12 changes: 12 additions & 0 deletions flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3894,6 +3894,18 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
return getReduceVars().size();
}

unsigned getInductionVarsStart() {
return 0;
}

unsigned getLocalOperandsStart() {
return getNumInductionVars();
}

unsigned getReduceOperandsStart() {
return getLocalOperandsStart() + getNumLocalOperands();
}

mlir::Block::BlockArgListType getInductionVars() {
return getBody()->getArguments().slice(0, getNumInductionVars());
}
192 changes: 114 additions & 78 deletions flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -137,6 +137,9 @@ void collectLoopLiveIns(fir::DoConcurrentLoopOp loop,

liveIns.push_back(operand->get());
});

for (mlir::Value local : loop.getLocalVars())
liveIns.push_back(local);
}

/// Collects values that are local to a loop: "loop-local values". A loop-local
@@ -251,8 +254,7 @@ class DoConcurrentConversion
.getIsTargetDevice();

mlir::omp::TargetOperands targetClauseOps;
genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
loopNestClauseOps,
genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps,
isTargetDevice ? nullptr : &targetClauseOps);

LiveInShapeInfoMap liveInShapeInfoMap;
@@ -274,14 +276,13 @@
}

mlir::omp::ParallelOp parallelOp =
genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
genParallelOp(rewriter, loop, ivInfos, mapper);

// Only set as composite when part of `distribute parallel do`.
parallelOp.setComposite(mapToDevice);

if (!mapToDevice)
genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
loopNestClauseOps);
genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps);

for (mlir::Value local : locals)
looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
@@ -290,10 +291,38 @@
if (mapToDevice)
genDistributeOp(doLoop.getLoc(), rewriter).setComposite(/*val=*/true);

mlir::omp::LoopNestOp ompLoopNest =
auto [loopNestOp, wsLoopOp] =
genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
/*isComposite=*/mapToDevice);

// `local` region arguments are transferred/cloned from the `do concurrent`
// loop to the loopnest op when the region is cloned above. Instead, these
// region arguments should be on the workshare loop's region.
if (mapToDevice) {
for (auto [parallelArg, loopNestArg] : llvm::zip_equal(
parallelOp.getRegion().getArguments(),
loopNestOp.getRegion().getArguments().slice(
loop.getLocalOperandsStart(), loop.getNumLocalOperands())))
rewriter.replaceAllUsesWith(loopNestArg, parallelArg);

for (auto [wsloopArg, loopNestArg] : llvm::zip_equal(
wsLoopOp.getRegion().getArguments(),
loopNestOp.getRegion().getArguments().slice(
loop.getReduceOperandsStart(), loop.getNumReduceOperands())))
rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
} else {
for (auto [wsloopArg, loopNestArg] :
llvm::zip_equal(wsLoopOp.getRegion().getArguments(),
loopNestOp.getRegion().getArguments().drop_front(
loopNestClauseOps.loopLowerBounds.size())))
rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
}

for (unsigned i = 0;
i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
loopNestOp.getRegion().eraseArgument(
loopNestClauseOps.loopLowerBounds.size());

rewriter.setInsertionPoint(doLoop);
fir::FirOpBuilder builder(
rewriter,
@@ -314,7 +343,7 @@
// Mark `unordered` loops that are not perfectly nested to be skipped from
// the legality check of the `ConversionTarget` since we are not interested
// in mapping them to OpenMP.
ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) {
loopNestOp->walk([&](fir::DoConcurrentOp doLoop) {
concurrentLoopsToSkip.insert(doLoop);
});

@@ -370,11 +399,21 @@
llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>;

mlir::omp::ParallelOp
genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
genParallelOp(mlir::ConversionPatternRewriter &rewriter,
fir::DoConcurrentLoopOp loop,
looputils::InductionVariableInfos &ivInfos,
mlir::IRMapping &mapper) const {
auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc);
rewriter.createBlock(&parallelOp.getRegion());
mlir::omp::ParallelOperands parallelOps;

if (mapToDevice)
genPrivatizers(rewriter, mapper, loop, parallelOps);

mlir::Location loc = loop.getLoc();
auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc, parallelOps);
Fortran::common::openmp::EntryBlockArgs parallelArgs;
parallelArgs.priv.vars = parallelOps.privateVars;
Fortran::common::openmp::genEntryBlock(rewriter, parallelArgs,
parallelOp.getRegion());
rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));

genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
@@ -411,7 +450,7 @@

void genLoopNestClauseOps(
mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
fir::DoConcurrentLoopOp loop,
mlir::omp::LoopNestOperands &loopNestClauseOps,
mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
assert(loopNestClauseOps.loopLowerBounds.empty() &&
Expand Down Expand Up @@ -440,59 +479,14 @@ class DoConcurrentConversion
loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
}

mlir::omp::LoopNestOp
std::pair<mlir::omp::LoopNestOp, mlir::omp::WsloopOp>
genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
const mlir::omp::LoopNestOperands &clauseOps,
bool isComposite) const {
mlir::omp::WsloopOperands wsloopClauseOps;

auto cloneFIRRegionToOMP = [&rewriter](mlir::Region &firRegion,
mlir::Region &ompRegion) {
if (!firRegion.empty()) {
rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());
auto firYield =
mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator());
rewriter.setInsertionPoint(firYield);
mlir::omp::YieldOp::create(rewriter, firYield.getLoc(),
firYield.getOperands());
rewriter.eraseOp(firYield);
}
};

// For `local` (and `local_init`) operands, emit corresponding `private`
// clauses and attach these clauses to the workshare loop.
if (!loop.getLocalVars().empty())
for (auto [op, sym, arg] : llvm::zip_equal(
loop.getLocalVars(),
loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
loop.getRegionLocalArgs())) {
auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
sym.getLeafReference());
if (localizer.getLocalitySpecifierType() ==
fir::LocalitySpecifierType::LocalInit)
TODO(localizer.getLoc(),
"local_init conversion is not supported yet");

mlir::OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointAfter(localizer);

auto privatizer = mlir::omp::PrivateClauseOp::create(
rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
localizer.getTypeAttr().getValue(),
mlir::omp::DataSharingClauseType::Private);

cloneFIRRegionToOMP(localizer.getInitRegion(),
privatizer.getInitRegion());
cloneFIRRegionToOMP(localizer.getDeallocRegion(),
privatizer.getDeallocRegion());

moduleSymbolTable.insert(privatizer);

wsloopClauseOps.privateVars.push_back(op);
wsloopClauseOps.privateSyms.push_back(
mlir::SymbolRefAttr::get(privatizer));
}
if (!mapToDevice)
genPrivatizers(rewriter, mapper, loop, wsloopClauseOps);

if (!loop.getReduceVars().empty()) {
for (auto [op, byRef, sym, arg] : llvm::zip_equal(
@@ -515,15 +509,15 @@
rewriter, firReducer.getLoc(), ompReducerName,
firReducer.getTypeAttr().getValue());

cloneFIRRegionToOMP(firReducer.getAllocRegion(),
cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
ompReducer.getAllocRegion());
cloneFIRRegionToOMP(firReducer.getInitializerRegion(),
cloneFIRRegionToOMP(rewriter, firReducer.getInitializerRegion(),
ompReducer.getInitializerRegion());
cloneFIRRegionToOMP(firReducer.getReductionRegion(),
cloneFIRRegionToOMP(rewriter, firReducer.getReductionRegion(),
ompReducer.getReductionRegion());
cloneFIRRegionToOMP(firReducer.getAtomicReductionRegion(),
cloneFIRRegionToOMP(rewriter, firReducer.getAtomicReductionRegion(),
ompReducer.getAtomicReductionRegion());
cloneFIRRegionToOMP(firReducer.getCleanupRegion(),
cloneFIRRegionToOMP(rewriter, firReducer.getCleanupRegion(),
ompReducer.getCleanupRegion());
moduleSymbolTable.insert(ompReducer);
}
Expand Down Expand Up @@ -555,21 +549,10 @@ class DoConcurrentConversion

rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
mlir::omp::YieldOp::create(rewriter, loop->getLoc());
loop->getParentOfType<mlir::ModuleOp>().print(
llvm::errs(), mlir::OpPrintingFlags().assumeVerified());

// `local` region arguments are transferred/cloned from the `do concurrent`
// loop to the loopnest op when the region is cloned above. Instead, these
// region arguments should be on the workshare loop's region.
for (auto [wsloopArg, loopNestArg] :
llvm::zip_equal(wsloopOp.getRegion().getArguments(),
loopNestOp.getRegion().getArguments().drop_front(
clauseOps.loopLowerBounds.size())))
rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);

for (unsigned i = 0;
i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size());

return loopNestOp;
return {loopNestOp, wsloopOp};
}

void genBoundsOps(fir::FirOpBuilder &builder, mlir::Value liveIn,
@@ -810,6 +793,59 @@
return distOp;
}

void cloneFIRRegionToOMP(mlir::ConversionPatternRewriter &rewriter,
mlir::Region &firRegion,
mlir::Region &ompRegion) const {
if (!firRegion.empty()) {
rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());
auto firYield =
mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator());
rewriter.setInsertionPoint(firYield);
mlir::omp::YieldOp::create(rewriter, firYield.getLoc(),
firYield.getOperands());
rewriter.eraseOp(firYield);
}
}

void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
mlir::omp::PrivateClauseOps &privateClauseOps) const {
// For `local` (and `local_init`) operands, emit corresponding `private`
// clauses and attach these clauses to the workshare loop.
if (!loop.getLocalVars().empty())
for (auto [var, sym, arg] : llvm::zip_equal(
loop.getLocalVars(),
loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
loop.getRegionLocalArgs())) {
auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
sym.getLeafReference());
if (localizer.getLocalitySpecifierType() ==
fir::LocalitySpecifierType::LocalInit)
TODO(localizer.getLoc(),
"local_init conversion is not supported yet");

mlir::OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointAfter(localizer);

auto privatizer = mlir::omp::PrivateClauseOp::create(
rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
localizer.getTypeAttr().getValue(),
mlir::omp::DataSharingClauseType::Private);

cloneFIRRegionToOMP(rewriter, localizer.getInitRegion(),
privatizer.getInitRegion());
cloneFIRRegionToOMP(rewriter, localizer.getDeallocRegion(),
privatizer.getDeallocRegion());

moduleSymbolTable.insert(privatizer);

privateClauseOps.privateVars.push_back(mapToDevice ? mapper.lookup(var)
: var);
privateClauseOps.privateSyms.push_back(
mlir::SymbolRefAttr::get(privatizer));
}
}

bool mapToDevice;
llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
mlir::SymbolTable &moduleSymbolTable;
49 changes: 49 additions & 0 deletions flang/test/Transforms/DoConcurrent/local_device.mlir
@@ -0,0 +1,49 @@
// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" %s -o - | FileCheck %s

fir.local {type = local} @_QFfooEmy_local_private_f32 : f32

func.func @_QPfoo() {
%0 = fir.dummy_scope : !fir.dscope
%3 = fir.alloca f32 {bindc_name = "my_local", uniq_name = "_QFfooEmy_local"}
%4:2 = hlfir.declare %3 {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)

%c1 = arith.constant 1 : index
%c10 = arith.constant 10 : index

fir.do_concurrent {
%7 = fir.alloca i32 {bindc_name = "i"}
%8:2 = hlfir.declare %7 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)

fir.do_concurrent.loop (%arg0) = (%c1) to (%c10) step (%c1) local(@_QFfooEmy_local_private_f32 %4#0 -> %arg1 : !fir.ref<f32>) {
%9 = fir.convert %arg0 : (index) -> i32
fir.store %9 to %8#0 : !fir.ref<i32>
%10:2 = hlfir.declare %arg1 {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
%cst = arith.constant 4.200000e+01 : f32
hlfir.assign %cst to %10#0 : f32, !fir.ref<f32>
}
}
return
}

// CHECK: omp.private {type = private} @[[OMP_PRIVATIZER:.*.omp]] : f32

// CHECK: %[[LOCAL_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}my_local"}
// CHECK: %[[LOCAL_MAP:.*]] = omp.map.info var_ptr(%[[LOCAL_DECL]]#1 : {{.*}})

// CHECK: omp.target host_eval({{.*}}) map_entries({{.*}}, %[[LOCAL_MAP]] -> %[[LOCAL_MAP_ARG:.*]] : {{.*}}) {
// CHECK: %[[LOCAL_DEV_DECL:.*]]:2 = hlfir.declare %[[LOCAL_MAP_ARG]] {uniq_name = "_QFfooEmy_local"}

// CHECK: omp.teams {
// CHECK: omp.parallel private(@[[OMP_PRIVATIZER]] %[[LOCAL_DEV_DECL]]#0 -> %[[LOCAL_PRIV_ARG:.*]] : {{.*}}) {
// CHECK: omp.distribute {
// CHECK: omp.wsloop {
// CHECK: omp.loop_nest {{.*}} {
// CHECK: %[[LOCAL_LOOP_DECL:.*]]:2 = hlfir.declare %[[LOCAL_PRIV_ARG]] {uniq_name = "_QFfooEmy_local"}
// CHECK: hlfir.assign %{{.*}} to %[[LOCAL_LOOP_DECL]]#0
// CHECK: omp.yield
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: }