-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[Flang] Hoist concurrent-limit and concurrent-step expressions outside the outer most do concurrent loop #111665
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
…e the outer most do concurrent loop
|
@llvm/pr-subscribers-flang-fir-hlfir Author: None (harishch4) Changes…e the outer most do concurrent loop Full diff: https://github.com/llvm/llvm-project/pull/111665.diff 3 Files Affected:
diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h
index 7f1b93c564b4c4..de9abea3014356 100644
--- a/flang/include/flang/Lower/PFTBuilder.h
+++ b/flang/include/flang/Lower/PFTBuilder.h
@@ -362,6 +362,7 @@ struct Evaluation : EvaluationVariant {
bool activeConstruct{false}; // temporarily set for some constructs
mlir::Block *block{nullptr}; // isNewBlock block (ActionStmt, ConstructStmt)
int printIndex{0}; // (ActionStmt, ConstructStmt) evaluation index for dumps
+ mlir::Operation *op{nullptr}; // associated mlir operation
};
using ProgramVariant =
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 0894a5903635e1..02871009020dd6 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2012,6 +2012,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
IncrementLoopNestInfo incrementLoopNestInfo;
const Fortran::parser::ScalarLogicalExpr *whileCondition = nullptr;
bool infiniteLoop = !loopControl.has_value();
+ bool isConcurrent = false;
if (infiniteLoop) {
assert(unstructuredContext && "infinite loop must be unstructured");
startBlock(headerBlock);
@@ -2042,6 +2043,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
std::get_if<Fortran::parser::LoopControl::Concurrent>(
&loopControl->u);
assert(concurrent && "invalid DO loop variant");
+ isConcurrent = true;
incrementLoopNestInfo = getConcurrentControl(
std::get<Fortran::parser::ConcurrentHeader>(concurrent->t),
std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent->t));
@@ -2070,7 +2072,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
// Increment loop begin code. (Infinite/while code was already generated.)
if (!infiniteLoop && !whileCondition)
- genFIRIncrementLoopBegin(incrementLoopNestInfo, doStmtEval.dirs);
+ genFIRIncrementLoopBegin(incrementLoopNestInfo, doStmtEval.dirs,
+ isConcurrent);
// Loop body code.
auto iter = eval.getNestedEvaluations().begin();
@@ -2128,12 +2131,26 @@ class FirConverter : public Fortran::lower::AbstractConverter {
/// Generate FIR to begin a structured or unstructured increment loop nest.
void genFIRIncrementLoopBegin(
IncrementLoopNestInfo &incrementLoopNestInfo,
- llvm::SmallVectorImpl<const Fortran::parser::CompilerDirective *> &dirs) {
+ llvm::SmallVectorImpl<const Fortran::parser::CompilerDirective *> &dirs,
+ bool isConcurrent) {
assert(!incrementLoopNestInfo.empty() && "empty loop nest");
mlir::Location loc = toLocation();
+ Fortran::lower::pft::Evaluation &eval = getEval();
+ Fortran::lower::pft::Evaluation *outermostEval = nullptr;
+ if (isConcurrent) {
+ outermostEval = &eval;
+ while (outermostEval->parentConstruct) {
+ outermostEval = outermostEval->parentConstruct;
+ }
+ }
+ mlir::OpBuilder::InsertPoint insertPt;
for (IncrementLoopInfo &info : incrementLoopNestInfo) {
info.loopVariable =
genLoopVariableAddress(loc, *info.loopVariableSym, info.isUnordered);
+ if (outermostEval && outermostEval->op) {
+ insertPt = builder->saveInsertionPoint();
+ builder->setInsertionPoint(outermostEval->op);
+ }
mlir::Value lowerValue = genControlValue(info.lowerExpr, info);
mlir::Value upperValue = genControlValue(info.upperExpr, info);
bool isConst = true;
@@ -2144,7 +2161,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
info.stepVariable = builder->createTemporary(loc, stepValue.getType());
builder->create<fir::StoreOp>(loc, stepValue, info.stepVariable);
}
-
+ if (outermostEval && outermostEval->op)
+ builder->restoreInsertionPoint(insertPt);
// Structured loop - generate fir.do_loop.
if (info.isStructured()) {
mlir::Type loopVarType = info.getLoopVariableType();
@@ -2179,6 +2197,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
builder->setInsertionPointToStart(info.doLoop.getBody());
loopValue = info.doLoop.getRegionIterArgs()[0];
}
+ eval.op = info.doLoop;
// Update the loop variable value in case it has non-index references.
builder->create<fir::StoreOp>(loc, loopValue, info.loopVariable);
if (info.maskExpr) {
diff --git a/flang/test/Lower/do_concurrent.f90 b/flang/test/Lower/do_concurrent.f90
new file mode 100644
index 00000000000000..3a09ed97ebd84d
--- /dev/null
+++ b/flang/test/Lower/do_concurrent.f90
@@ -0,0 +1,50 @@
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+
+! Simple tests for structured concurrent loops with loop-control.
+
+pure function bar(n, m)
+ implicit none
+ integer, intent(in) :: n, m
+ integer :: bar
+ bar = n + m
+end function
+
+subroutine sub1(n)
+ implicit none
+ integer :: n, m, i, j
+ integer, dimension(n) :: a
+!CHECK: %[[LB1:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
+!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref<i32>
+!CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index
+!CHECK: %[[LB2:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index
+!CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
+!CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index
+!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
+ do concurrent(i=1:n, j=1:bar(n*m, n/m))
+ a(i) = n
+ end do
+end subroutine
+
+subroutine sub2(n)
+ implicit none
+ integer :: n, m, i, j
+ integer, dimension(n) :: a
+!CHECK: %[[LB1:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
+!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref<i32>
+!CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index
+!CHECK: %[[LB2:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index
+!CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
+!CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index
+!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
+ do concurrent(i=1:n)
+ do concurrent(j=1:bar(n*m, n/m))
+ a(i) = n
+ end do
+ end do
+end subroutine
|
|
Can you add some explanation on why you need to do that? |
If you have the following multi-range do concurrent(i=1:n, j=1:bar(n*m, n/m))
a(i) = n
end doCurrently, flang generates the following IR: fir.do_loop %arg1 = %42 to %44 step %c1 unordered {
...
%53:3 = hlfir.associate %49 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
%54:3 = hlfir.associate %52 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
%55 = fir.call @_QFPbar(%53#1, %54#1) fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
hlfir.end_associate %53#1, %53#2 : !fir.ref<i32>, i1
hlfir.end_associate %54#1, %54#2 : !fir.ref<i32>, i1
%56 = fir.convert %55 : (i32) -> index
...
fir.do_loop %arg2 = %46 to %56 step %c1_4 unordered {
...
}
}However, if Moreover, the standard describes the execution of From the above 2 points, it seems to me that execution is divided in multiple consecutive stages: 11.1.7.4.2 is the stage where we evaluate all control expressions including the step and then 11.1.7.4.3 is the stage to execute the block of the concurrent loop itself using the combination of possible iteration values. @clementval let me know if this makes sense to you as well. @harishch4 can you please add the above to the description of the PR? |
| bool activeConstruct{false}; // temporarily set for some constructs | ||
| mlir::Block *block{nullptr}; // isNewBlock block (ActionStmt, ConstructStmt) | ||
| int printIndex{0}; // (ActionStmt, ConstructStmt) evaluation index for dumps | ||
| mlir::Operation *op{nullptr}; // associated mlir operation |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If something like this is needed, it probably belongs in struct IncrementLoopInfo near the top of Bridge.cpp. But it probably isn't needed - see other comments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1, there is no one-to-one mapping between a pft::Evaluation and an operation, so it looks weird to me to set this field here in a very generic data structure for a limited use case.
| IncrementLoopNestInfo incrementLoopNestInfo; | ||
| const Fortran::parser::ScalarLogicalExpr *whileCondition = nullptr; | ||
| bool infiniteLoop = !loopControl.has_value(); | ||
| bool isConcurrent = false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like the existing IncrementLoopInfo flag isUnordered. That name is the fir terminology. It was originally set up to also be used for the forall case, but now I believe there is a 1-1 correspondence. (If this is needed.)
|
f23 Constraint C1125 A concurrent-limit or concurrent-step in a concurrent-control shall not contain a reference to any index-name in the concurrent-control-list in which it appears. This constraint should allow moving all loop control expressions out of a multi-dimension loop body. But it doesn't in general allow those to be moved even higher, such as above the distinct outer loop in test function Since this is a specific instance of loop invariant code motion, it is likely best to leave this to an optimization pass that does the analysis to not only address this case, but also other cases. Another example is calling a pure function in the control code of a non-doconcurrent loop nested inside an enclosing loop. If there really is a good reason to do this early, the information needed to hoist control code to at most the top of a multiple-dimension loop is local to |
jeanPerier
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree with @vdonaldson comments. The patch is trying to solve two different problems at once.
The first one, evaluating all the concurrent limit and steps of a single do concurrent constructs before the related fir.do_loop, I agree makes sense to do in lowering because that is what the language requires. Thanks for working on that.
The second one is a general loop hoisting optimization that cannot be done in general, a FIR/MLIR pass will be needed for that. The main thing FIR operation need to use MLIR code motion utilities is the speculability trait (which obviously cannot blindly be added on all operations).
To solve this, you probably do not even need to add a lot of contexts/new argument, you can likely just solve it internally in genFIRIncrementLoopBegin by either keeping an insertion point before the fir.loops and generating the loop bounds SSA value there like you did, or do a first iteration to generate all these value and keep them in some vectors for later use in the iteration loop that generate the fir.do_loop.
| bool activeConstruct{false}; // temporarily set for some constructs | ||
| mlir::Block *block{nullptr}; // isNewBlock block (ActionStmt, ConstructStmt) | ||
| int printIndex{0}; // (ActionStmt, ConstructStmt) evaluation index for dumps | ||
| mlir::Operation *op{nullptr}; // associated mlir operation |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1, there is no one-to-one mapping between a pft::Evaluation and an operation, so it looks weird to me to set this field here in a very generic data structure for a limited use case.
| subroutine sub2(n) | ||
| implicit none | ||
| integer :: n, m, i, j | ||
| integer, dimension(n) :: a | ||
| !CHECK: %[[LB1:.*]] = arith.constant 1 : i32 | ||
| !CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index | ||
| !CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref<i32> | ||
| !CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index | ||
| !CHECK: %[[LB2:.*]] = arith.constant 1 : i32 | ||
| !CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index | ||
| !CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32 | ||
| !CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index | ||
| !CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered | ||
| !CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered | ||
| do concurrent(i=1:n) | ||
| do concurrent(j=1:bar(n*m, n/m)) | ||
| a(i) = n | ||
| end do | ||
| end do | ||
| end subroutine |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This case is different than sub1 since 11.1.7.4.2 does not say that the concurrent limit and steps of all nested do concurrent construct must be performed before the outter constructs.
You can have:
subroutine sub3(a, n)
implicit none
integer :: n
integer, dimension(n, n) :: a
do concurrent(integer::i=1:n)
do concurrent(integer::j=1:i)
a(i, j) = n
end do
end do
end subroutine
integer :: a(4,4) = -1
call sub3(a, 4)
print *, a
end
This is a valid program, and with your current patch it will compile to invalid code that will most likely segfault at runtime.
|
Sorry for the delayed respone here, I am taking over the PR from @harishch4 and will look into the comments. |
|
Closed in favor of #114020. |
No description provided.