Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 103 additions & 93 deletions flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ namespace flangomp {
#include "flang/Optimizer/OpenMP/Passes.h.inc"
} // namespace flangomp

#define DEBUG_TYPE "fopenmp-do-concurrent-conversion"
#define DEBUG_TYPE "do-concurrent-conversion"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")

namespace Fortran {
namespace lower {
Expand Down Expand Up @@ -175,9 +176,24 @@ bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) {
return false;
}

/// For the \p doLoop parameter, find the operations that declares its induction
/// variable or allocates memory for it.
mlir::Operation *findLoopIndVarMemDecl(fir::DoLoopOp doLoop) {
mlir::Value result = nullptr;
mlir::visitUsedValuesDefinedAbove(
doLoop.getRegion(), [&](mlir::OpOperand *operand) {
if (isIndVarUltimateOperand(operand->getOwner(), doLoop)) {
assert(result == nullptr &&
"loop can have only one induction variable");
result = operand->get();
}
});

assert(result != nullptr && result.getDefiningOp() != nullptr);
return result.getDefiningOp();
}

/// Collect the list of values used inside the loop but defined outside of it.
/// The first item in the returned list is always the loop's induction
/// variable.
void collectLoopLiveIns(fir::DoLoopOp doLoop,
llvm::SmallVectorImpl<mlir::Value> &liveIns) {
llvm::SmallDenseSet<mlir::Value> seenValues;
Expand All @@ -194,9 +210,6 @@ void collectLoopLiveIns(fir::DoLoopOp doLoop,
return;

liveIns.push_back(operand->get());

if (isIndVarUltimateOperand(operand->getOwner(), doLoop))
std::swap(*liveIns.begin(), *liveIns.rbegin());
});
}

Expand Down Expand Up @@ -286,24 +299,78 @@ void collectIndirectConstOpChain(mlir::Operation *link,
opChain.insert(link);
}

/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
/// there are no operations in \p outerloop's other than:
///
/// 1. the operations needed to assing/update \p outerLoop's induction variable.
/// 2. \p innerLoop itself.
///

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we also need to test:

  1. Computing the inner DoLoop's lb/ub/step does not depend on the outer loop's indvar
  2. No instructions that have side-effects

[idea] It would be possible to just test whether the instructions can be sunk into the inner loop: it is not really relevant whether those instructions depend on the outer loop's indvar.

Copy link
Author

@ergawy ergawy Nov 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Computing the inner DoLoop's lb/ub/step does not depend on the outer loop's indvar

For loop nests (i.e. multi-range do concurrent loops) this is prohibited by the spec:

C1125 (R1126) A concurrent-limit or concurrent-step in a concurrent-control shall not contain a reference to
any index-name in the concurrent-control-list in which it appears.

And we emit conforming loop nests now after llvm#114020 so no need to check for that (this is what the backward slices were used for in the earlier PR).

No instructions that have side-effects

You mean sibling instructions to the inner loop (i.e. either before or after it within the outer loop)? Or instruction within the body of the innermost loop in the nest?

/// \p return true if \p innerLoop is perfectly nested inside \p outerLoop
/// according to the above definition.
bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
mlir::BackwardSliceOptions backwardSliceOptions;
backwardSliceOptions.inclusive = true;
// We will collect the backward slices for innerLoop's LB, UB, and step.
// However, we want to limit the scope of these slices to the scope of
// outerLoop's region.
backwardSliceOptions.filter = [&](mlir::Operation *op) {
return !mlir::areValuesDefinedAbove(op->getResults(),
outerLoop.getRegion());
};
Comment on lines +311 to +319

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The backwardSliceOptions is not used anywhere?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Luckily not anymore. We don't have to extract the slices relevant to innerLoop's LB, UB, and step since they are now hoised for loop nests by llvm#114020.


mlir::ForwardSliceOptions forwardSliceOptions;
forwardSliceOptions.inclusive = true;
// We don't care about the outer-loop's induction variable's uses within the
// inner-loop, so we filter out these uses.
forwardSliceOptions.filter = [&](mlir::Operation *op) {
return mlir::areValuesDefinedAbove(op->getResults(), innerLoop.getRegion());
};

llvm::SetVector<mlir::Operation *> indVarSlice;
mlir::getForwardSlice(outerLoop.getInductionVar(), &indVarSlice,
forwardSliceOptions);
llvm::DenseSet<mlir::Operation *> innerLoopSetupOpsSet(indVarSlice.begin(),
indVarSlice.end());

llvm::DenseSet<mlir::Operation *> loopBodySet;
outerLoop.walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *op) {
if (op == outerLoop)
return mlir::WalkResult::advance();

if (op == innerLoop)
return mlir::WalkResult::skip();

if (mlir::isa<fir::ResultOp>(op))
return mlir::WalkResult::advance();

loopBodySet.insert(op);
return mlir::WalkResult::advance();
});

bool result = (loopBodySet == innerLoopSetupOpsSet);
mlir::Location loc = outerLoop.getLoc();
LLVM_DEBUG(DBGS() << "Loop pair starting at location " << loc << " is"
<< (result ? "" : " not") << " perfectly nested\n");

return result;
}

/// Starting with `outerLoop` collect a perfectly nested loop nest, if any. This
/// function collects as much as possible loops in the nest; it case it fails to
/// recognize a certain nested loop as part of the nest it just returns the
/// parent loops it discovered before.
mlir::LogicalResult collectLoopNest(fir::DoLoopOp outerLoop,
mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
LoopNestToIndVarMap &loopNest) {
assert(outerLoop.getUnordered());
llvm::SmallVector<mlir::Value> outerLoopLiveIns;
collectLoopLiveIns(outerLoop, outerLoopLiveIns);
assert(currentLoop.getUnordered());

while (true) {
loopNest.try_emplace(
outerLoop,
currentLoop,
InductionVariableInfo{
outerLoopLiveIns.front().getDefiningOp(),
std::move(looputils::extractIndVarUpdateOps(outerLoop))});
findLoopIndVarMemDecl(currentLoop),
std::move(looputils::extractIndVarUpdateOps(currentLoop))});

auto directlyNestedLoops = outerLoop.getRegion().getOps<fir::DoLoopOp>();
auto directlyNestedLoops = currentLoop.getRegion().getOps<fir::DoLoopOp>();
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;

for (auto nestedLoop : directlyNestedLoops)
Expand All @@ -318,69 +385,10 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp outerLoop,

fir::DoLoopOp nestedUnorderedLoop = unorderedLoops.front();

if ((nestedUnorderedLoop.getLowerBound().getDefiningOp() == nullptr) ||
(nestedUnorderedLoop.getUpperBound().getDefiningOp() == nullptr) ||
(nestedUnorderedLoop.getStep().getDefiningOp() == nullptr))
return mlir::failure();

llvm::SmallVector<mlir::Value> nestedLiveIns;
collectLoopLiveIns(nestedUnorderedLoop, nestedLiveIns);

llvm::DenseSet<mlir::Value> outerLiveInsSet;
llvm::DenseSet<mlir::Value> nestedLiveInsSet;

// Returns a "unified" view of an mlir::Value. This utility checks if the
// value is defined by an op, and if so, return the first value defined by
// that op (if there are many), otherwise just returns the value.
//
// This serves the purpose that if, for example, `%op_res#0` is used in the
// outer loop and `%op_res#1` is used in the nested loop (or vice versa),
// that we detect both as the same value. If we did not do so, we might
// falesely detect that the 2 loops are not perfectly nested since they use
// "different" sets of values.
auto getUnifiedLiveInView = [](mlir::Value liveIn) {
return liveIn.getDefiningOp() != nullptr
? liveIn.getDefiningOp()->getResult(0)
: liveIn;
};

// Re-package both lists of live-ins into sets so that we can use set
// equality to compare the values used in the outerloop vs. the nestd one.

for (auto liveIn : nestedLiveIns)
nestedLiveInsSet.insert(getUnifiedLiveInView(liveIn));

mlir::Value outerLoopIV;
for (auto liveIn : outerLoopLiveIns) {
outerLiveInsSet.insert(getUnifiedLiveInView(liveIn));

// Keep track of the IV of the outerloop. See `isPerfectlyNested` for more
// info on the reason.
if (outerLoopIV == nullptr)
outerLoopIV = getUnifiedLiveInView(liveIn);
}

// For the 2 loops to be perfectly nested, either:
// * both would have exactly the same set of live-in values or,
// * the outer loop would have exactly 1 extra live-in value: the outer
// loop's induction variable; this happens when the outer loop's IV is
// *not* referenced in the nested loop.
bool isPerfectlyNested = [&]() {
if (outerLiveInsSet == nestedLiveInsSet)
return true;

if ((outerLiveInsSet.size() == nestedLiveIns.size() + 1) &&
!nestedLiveInsSet.contains(outerLoopIV))
return true;

return false;
}();

if (!isPerfectlyNested)
if (!isPerfectlyNested(currentLoop, nestedUnorderedLoop))
return mlir::failure();

outerLoop = nestedUnorderedLoop;
outerLoopLiveIns = std::move(nestedLiveIns);
currentLoop = nestedUnorderedLoop;
}

return mlir::success();
Expand Down Expand Up @@ -554,10 +562,6 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
"defining operation.");
}

llvm::SmallVector<mlir::Value> outermostLoopLiveIns;
looputils::collectLoopLiveIns(doLoop, outermostLoopLiveIns);
assert(!outermostLoopLiveIns.empty());

looputils::LoopNestToIndVarMap loopNest;
bool hasRemainingNestedLoops =
failed(looputils::collectLoopNest(doLoop, loopNest));
Expand All @@ -566,15 +570,19 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
"Some `do concurent` loops are not perfectly-nested. "
"These will be serialzied.");

llvm::SmallVector<mlir::Value> loopNestLiveIns;
looputils::collectLoopLiveIns(loopNest.back().first, loopNestLiveIns);
assert(!loopNestLiveIns.empty());

llvm::SetVector<mlir::Value> locals;
looputils::collectLoopLocalValues(loopNest.back().first, locals);
// We do not want to map "loop-local" values to the device through
// `omp.map.info` ops. Therefore, we remove them from the list of live-ins.
outermostLoopLiveIns.erase(llvm::remove_if(outermostLoopLiveIns,
[&](mlir::Value liveIn) {
return locals.contains(liveIn);
}),
outermostLoopLiveIns.end());
loopNestLiveIns.erase(llvm::remove_if(loopNestLiveIns,
[&](mlir::Value liveIn) {
return locals.contains(liveIn);
}),
loopNestLiveIns.end());

looputils::sinkLoopIVArgs(rewriter, loopNest);

Expand All @@ -590,24 +598,25 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
loopNestClauseOps, &targetClauseOps);

// Prevent mapping host-evaluated variables.
outermostLoopLiveIns.erase(
llvm::remove_if(outermostLoopLiveIns,
loopNestLiveIns.erase(
llvm::remove_if(loopNestLiveIns,
[&](mlir::Value liveIn) {
return llvm::is_contained(
targetClauseOps.hostEvalVars, liveIn);
}),
outermostLoopLiveIns.end());
loopNestLiveIns.end());

// The outermost loop will contain all the live-in values in all nested
// loops since live-in values are collected recursively for all nested
// ops.
for (mlir::Value liveIn : outermostLoopLiveIns)
for (mlir::Value liveIn : loopNestLiveIns)
targetClauseOps.mapVars.push_back(
genMapInfoOpForLiveIn(rewriter, liveIn));

targetOp =
genTargetOp(doLoop.getLoc(), rewriter, mapper, outermostLoopLiveIns,
genTargetOp(doLoop.getLoc(), rewriter, mapper, loopNestLiveIns,
targetClauseOps, loopNestClauseOps);

genTeamsOp(doLoop.getLoc(), rewriter);
}

Expand Down Expand Up @@ -998,10 +1007,11 @@ class DoConcurrentConversionPass
context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
concurrentLoopsToSkip);
mlir::ConversionTarget target(*context);
target.addLegalDialect<
fir::FIROpsDialect, hlfir::hlfirDialect, mlir::arith::ArithDialect,
mlir::func::FuncDialect, mlir::omp::OpenMPDialect,
mlir::cf::ControlFlowDialect, mlir::math::MathDialect>();
target
.addLegalDialect<fir::FIROpsDialect, hlfir::hlfirDialect,
mlir::arith::ArithDialect, mlir::func::FuncDialect,
mlir::omp::OpenMPDialect, mlir::cf::ControlFlowDialect,
mlir::math::MathDialect, mlir::LLVM::LLVMDialect>();

target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
Expand Down
87 changes: 87 additions & 0 deletions flang/test/Transforms/DoConcurrent/loop_nest_test.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
! Tests loop-nest detection algorithm for do-concurrent mapping.

! REQUIRES: asserts

! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host \
! RUN: -mmlir -debug %s -o - 2> %t.log || true

! RUN: FileCheck %s < %t.log

program main
implicit none

contains

subroutine foo(n)
implicit none
integer :: n, m
integer :: i, j, k
integer :: x
integer, dimension(n) :: a
integer, dimension(n, n, n) :: b

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
do concurrent(i=1:n, j=1:bar(n*m, n/m))
a(i) = n
end do

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m))
a(i) = n
end do

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
do concurrent(i=bar(n, x):n)
do concurrent(j=1:bar(n*m, n/m))
a(i) = n
end do
end do

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
do concurrent(i=1:n)
x = 10
do concurrent(j=1:m)
b(i,j,k) = i * j + k
end do
end do

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
do concurrent(i=1:n)
do concurrent(j=1:m)
b(i,j,k) = i * j + k
end do
x = 10
end do

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is not perfectly nested
do concurrent(i=1:n)
do concurrent(j=1:m)
b(i,j,k) = i * j + k
x = 10
end do
end do

! CHECK: Loop pair starting at location
! CHECK: loc("{{.*}}":[[# @LINE + 1]]:{{.*}}) is perfectly nested
do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m), k=1:bar(n*m, bar(n*m, n/m)))
a(i) = n
end do


end subroutine

pure function bar(n, m)
implicit none
integer, intent(in) :: n, m
integer :: bar

bar = n + m
end function

end program main
Loading