[Flang] Hoist concurrent-limit and concurrent-step expressions outside the outer most do concurrent loop #111665

harishch4 · 2024-10-09T11:53:18Z

No description provided.

…e the outer most do concurrent loop

llvmbot · 2024-10-09T11:53:55Z

@llvm/pr-subscribers-flang-fir-hlfir

Author: None (harishch4)

Changes

…e the outer most do concurrent loop

Full diff: https://github.com/llvm/llvm-project/pull/111665.diff

3 Files Affected:

(modified) flang/include/flang/Lower/PFTBuilder.h (+1)
(modified) flang/lib/Lower/Bridge.cpp (+22-3)
(added) flang/test/Lower/do_concurrent.f90 (+50)

diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h
index 7f1b93c564b4c4..de9abea3014356 100644
--- a/flang/include/flang/Lower/PFTBuilder.h
+++ b/flang/include/flang/Lower/PFTBuilder.h
@@ -362,6 +362,7 @@ struct Evaluation : EvaluationVariant {
   bool activeConstruct{false}; // temporarily set for some constructs
   mlir::Block *block{nullptr}; // isNewBlock block (ActionStmt, ConstructStmt)
   int printIndex{0}; // (ActionStmt, ConstructStmt) evaluation index for dumps
+  mlir::Operation *op{nullptr}; // associated mlir operation
 };
 
 using ProgramVariant =
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 0894a5903635e1..02871009020dd6 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2012,6 +2012,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     IncrementLoopNestInfo incrementLoopNestInfo;
     const Fortran::parser::ScalarLogicalExpr *whileCondition = nullptr;
     bool infiniteLoop = !loopControl.has_value();
+    bool isConcurrent = false;
     if (infiniteLoop) {
       assert(unstructuredContext && "infinite loop must be unstructured");
       startBlock(headerBlock);
@@ -2042,6 +2043,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
           std::get_if<Fortran::parser::LoopControl::Concurrent>(
               &loopControl->u);
       assert(concurrent && "invalid DO loop variant");
+      isConcurrent = true;
       incrementLoopNestInfo = getConcurrentControl(
           std::get<Fortran::parser::ConcurrentHeader>(concurrent->t),
           std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent->t));
@@ -2070,7 +2072,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
     // Increment loop begin code. (Infinite/while code was already generated.)
     if (!infiniteLoop && !whileCondition)
-      genFIRIncrementLoopBegin(incrementLoopNestInfo, doStmtEval.dirs);
+      genFIRIncrementLoopBegin(incrementLoopNestInfo, doStmtEval.dirs,
+                               isConcurrent);
 
     // Loop body code.
     auto iter = eval.getNestedEvaluations().begin();
@@ -2128,12 +2131,26 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   /// Generate FIR to begin a structured or unstructured increment loop nest.
   void genFIRIncrementLoopBegin(
       IncrementLoopNestInfo &incrementLoopNestInfo,
-      llvm::SmallVectorImpl<const Fortran::parser::CompilerDirective *> &dirs) {
+      llvm::SmallVectorImpl<const Fortran::parser::CompilerDirective *> &dirs,
+      bool isConcurrent) {
     assert(!incrementLoopNestInfo.empty() && "empty loop nest");
     mlir::Location loc = toLocation();
+    Fortran::lower::pft::Evaluation &eval = getEval();
+    Fortran::lower::pft::Evaluation *outermostEval = nullptr;
+    if (isConcurrent) {
+      outermostEval = &eval;
+      while (outermostEval->parentConstruct) {
+        outermostEval = outermostEval->parentConstruct;
+      }
+    }
+    mlir::OpBuilder::InsertPoint insertPt;
     for (IncrementLoopInfo &info : incrementLoopNestInfo) {
       info.loopVariable =
           genLoopVariableAddress(loc, *info.loopVariableSym, info.isUnordered);
+      if (outermostEval && outermostEval->op) {
+        insertPt = builder->saveInsertionPoint();
+        builder->setInsertionPoint(outermostEval->op);
+      }
       mlir::Value lowerValue = genControlValue(info.lowerExpr, info);
       mlir::Value upperValue = genControlValue(info.upperExpr, info);
       bool isConst = true;
@@ -2144,7 +2161,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
         info.stepVariable = builder->createTemporary(loc, stepValue.getType());
         builder->create<fir::StoreOp>(loc, stepValue, info.stepVariable);
       }
-
+      if (outermostEval && outermostEval->op)
+        builder->restoreInsertionPoint(insertPt);
       // Structured loop - generate fir.do_loop.
       if (info.isStructured()) {
         mlir::Type loopVarType = info.getLoopVariableType();
@@ -2179,6 +2197,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
           builder->setInsertionPointToStart(info.doLoop.getBody());
           loopValue = info.doLoop.getRegionIterArgs()[0];
         }
+        eval.op = info.doLoop;
         // Update the loop variable value in case it has non-index references.
         builder->create<fir::StoreOp>(loc, loopValue, info.loopVariable);
         if (info.maskExpr) {
diff --git a/flang/test/Lower/do_concurrent.f90 b/flang/test/Lower/do_concurrent.f90
new file mode 100644
index 00000000000000..3a09ed97ebd84d
--- /dev/null
+++ b/flang/test/Lower/do_concurrent.f90
@@ -0,0 +1,50 @@
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+
+! Simple tests for structured concurrent loops with loop-control.
+
+pure function bar(n, m)
+   implicit none
+   integer, intent(in) :: n, m
+   integer :: bar
+   bar = n + m
+end function
+
+subroutine sub1(n)
+   implicit none
+   integer :: n, m, i, j
+   integer, dimension(n) :: a
+!CHECK: %[[LB1:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
+!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref<i32>
+!CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index
+!CHECK: %[[LB2:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index
+!CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
+!CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index
+!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
+   do concurrent(i=1:n, j=1:bar(n*m, n/m))
+      a(i) = n
+   end do
+end subroutine
+
+subroutine sub2(n)
+   implicit none
+   integer :: n, m, i, j
+   integer, dimension(n) :: a
+!CHECK: %[[LB1:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
+!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref<i32>
+!CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index
+!CHECK: %[[LB2:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index
+!CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
+!CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index
+!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
+   do concurrent(i=1:n)
+      do concurrent(j=1:bar(n*m, n/m))
+         a(i) = n
+      end do
+   end do
+end subroutine

clementval · 2024-10-09T15:40:09Z

Can you add some explanation on why you need to do that?

ergawy · 2024-10-16T03:33:24Z

Can you add some explanation of why you need to do that?

If you have the following multi-range do concurrent loop:

  do concurrent(i=1:n, j=1:bar(n*m, n/m))
    a(i) = n
  end do

Currently, flang generates the following IR:

    fir.do_loop %arg1 = %42 to %44 step %c1 unordered {
      ...
      %53:3 = hlfir.associate %49 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
      %54:3 = hlfir.associate %52 {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
      %55 = fir.call @_QFPbar(%53#1, %54#1) fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
      hlfir.end_associate %53#1, %53#2 : !fir.ref<i32>, i1
      hlfir.end_associate %54#1, %54#2 : !fir.ref<i32>, i1
      %56 = fir.convert %55 : (i32) -> index
      ...
      fir.do_loop %arg2 = %46 to %56 step %c1_4 unordered {
        ...
      }
    }

However, if bar is impure, then we have a direct violation of the standard:

C1143 A reference to an impure procedure shall not appear within a DO CONCURRENT construct.

Moreover, the standard describes the execution of do concurrent construct in multiple stages:

11.1.7.4 Execution of a DO construct
...
11.1.7.4.2 DO CONCURRENT loop control
The concurrent-limit and concurrent-step expressions in the concurrent-control-list are evaluated. ...

11.1.7.4.3 The execution cycle
...
The block of a DO CONCURRENT construct is executed for every active combination of the index-name values.
Each execution of the block is an iteration. The executions may occur in any order.

From the above 2 points, it seems to me that execution is divided in multiple consecutive stages: 11.1.7.4.2 is the stage where we evaluate all control expressions including the step and then 11.1.7.4.3 is the stage to execute the block of the concurrent loop itself using the combination of possible iteration values.

@clementval let me know if this makes sense to you as well. @harishch4 can you please add the above to the description of the PR?

vdonaldson · 2024-10-16T21:51:28Z

flang/include/flang/Lower/PFTBuilder.h

  bool activeConstruct{false}; // temporarily set for some constructs
  mlir::Block *block{nullptr}; // isNewBlock block (ActionStmt, ConstructStmt)
  int printIndex{0}; // (ActionStmt, ConstructStmt) evaluation index for dumps
+  mlir::Operation *op{nullptr}; // associated mlir operation


If something like this is needed, it probably belongs in struct IncrementLoopInfo near the top of Bridge.cpp. But it probably isn't needed - see other comments.

+1, there is no one-to-one mapping between a pft::Evaluation and an operation, so it looks weird to me to set this field here in a very generic data structure for a limited use case.

vdonaldson · 2024-10-16T21:55:00Z

flang/lib/Lower/Bridge.cpp

    IncrementLoopNestInfo incrementLoopNestInfo;
    const Fortran::parser::ScalarLogicalExpr *whileCondition = nullptr;
    bool infiniteLoop = !loopControl.has_value();
+    bool isConcurrent = false;


This looks like the existing IncrementLoopInfo flag isUnordered. That name is the fir terminology. It was originally set up to also be used for the forall case, but now I believe there is a 1-1 correspondence. (If this is needed.)

vdonaldson · 2024-10-16T22:35:28Z

f23 Constraint C1125 A concurrent-limit or concurrent-step in a concurrent-control shall not contain a reference to any index-name in the concurrent-control-list in which it appears.

This constraint should allow moving all loop control expressions out of a multi-dimension loop body. But it doesn't in general allow those to be moved even higher, such as above the distinct outer loop in test function sub2. Doing that would require additional analysis.

Since this is a specific instance of loop invariant code motion, it is likely best to leave this to an optimization pass that does the analysis to not only address this case, but also other cases. Another example is calling a pure function in the control code of a non-doconcurrent loop nested inside an enclosing loop.

If there really is a good reason to do this early, the information needed to hoist control code to at most the top of a multiple-dimension loop is local to genFIRIncrementLoopBegin. The code should probably be local there.

jeanPerier

I agree with @vdonaldson comments. The patch is trying to solve two different problems at once.
The first one, evaluating all the concurrent limit and steps of a single do concurrent constructs before the related fir.do_loop, I agree makes sense to do in lowering because that is what the language requires. Thanks for working on that.

The second one is a general loop hoisting optimization that cannot be done in general, a FIR/MLIR pass will be needed for that. The main thing FIR operation need to use MLIR code motion utilities is the speculability trait (which obviously cannot blindly be added on all operations).

To solve this, you probably do not even need to add a lot of contexts/new argument, you can likely just solve it internally in genFIRIncrementLoopBegin by either keeping an insertion point before the fir.loops and generating the loop bounds SSA value there like you did, or do a first iteration to generate all these value and keep them in some vectors for later use in the iteration loop that generate the fir.do_loop.

jeanPerier · 2024-10-17T07:48:50Z

flang/include/flang/Lower/PFTBuilder.h

  bool activeConstruct{false}; // temporarily set for some constructs
  mlir::Block *block{nullptr}; // isNewBlock block (ActionStmt, ConstructStmt)
  int printIndex{0}; // (ActionStmt, ConstructStmt) evaluation index for dumps
+  mlir::Operation *op{nullptr}; // associated mlir operation


+1, there is no one-to-one mapping between a pft::Evaluation and an operation, so it looks weird to me to set this field here in a very generic data structure for a limited use case.

jeanPerier · 2024-10-17T07:59:31Z

flang/test/Lower/do_concurrent.f90

+subroutine sub2(n)
+   implicit none
+   integer :: n, m, i, j
+   integer, dimension(n) :: a
+!CHECK: %[[LB1:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
+!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref<i32>
+!CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index
+!CHECK: %[[LB2:.*]] = arith.constant 1 : i32
+!CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index
+!CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
+!CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index
+!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
+!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
+   do concurrent(i=1:n)
+      do concurrent(j=1:bar(n*m, n/m))
+         a(i) = n
+      end do
+   end do
+end subroutine


This case is different than sub1 since 11.1.7.4.2 does not say that the concurrent limit and steps of all nested do concurrent construct must be performed before the outter constructs.

You can have:

subroutine sub3(a, n) implicit none integer :: n integer, dimension(n, n) :: a do concurrent(integer::i=1:n) do concurrent(integer::j=1:i) a(i, j) = n end do end do end subroutine integer :: a(4,4) = -1 call sub3(a, 4) print *, a end

This is a valid program, and with your current patch it will compile to invalid code that will most likely segfault at runtime.

ergawy · 2024-10-29T08:17:09Z

Sorry for the delayed respone here, I am taking over the PR from @harishch4 and will look into the comments.

ergawy · 2024-10-29T09:49:41Z

Closed in favor of #114020.

[Flang] Hoist concurrent-limit and concurrent-step expressions outsid…

5e7fcb0

…e the outer most do concurrent loop

harishch4 requested review from NimishMishra, Thirumalai-Shaktivel, ergawy and kiranktp October 9, 2024 11:53

llvmbot added flang Flang issues not falling into any other category flang:fir-hlfir labels Oct 9, 2024

ergawy requested a review from mjklemm October 10, 2024 12:51

ergawy requested a review from clementval October 16, 2024 03:43

clementval requested review from jeanPerier and vdonaldson October 16, 2024 19:02

vdonaldson reviewed Oct 16, 2024

View reviewed changes

jeanPerier requested changes Oct 17, 2024

View reviewed changes

ergawy changed the title ~~[Flang] Hoist concurrent-limit and concurrent-step expressions outsid…~~ [Flang] Hoist concurrent-limit and concurrent-step expressions outside the outer most do concurrent loop Oct 29, 2024

ergawy mentioned this pull request Oct 29, 2024

[flang][MLIR] Hoist do concurrent nest bounds/steps outside the nest #114020

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[Flang] Hoist concurrent-limit and concurrent-step expressions outside the outer most do concurrent loop #111665

[Flang] Hoist concurrent-limit and concurrent-step expressions outside the outer most do concurrent loop #111665

harishch4 commented Oct 9, 2024 •

edited by ergawy

Loading

Uh oh!

llvmbot commented Oct 9, 2024

Uh oh!

clementval commented Oct 9, 2024 •

edited

Loading

Uh oh!

ergawy commented Oct 16, 2024

Uh oh!

vdonaldson Oct 16, 2024

Uh oh!

jeanPerier Oct 17, 2024

Uh oh!

vdonaldson Oct 16, 2024

Uh oh!

vdonaldson commented Oct 16, 2024

Uh oh!

jeanPerier left a comment

Uh oh!

jeanPerier Oct 17, 2024

Uh oh!

jeanPerier Oct 17, 2024

Uh oh!

ergawy commented Oct 29, 2024

Uh oh!

ergawy commented Oct 29, 2024

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

[Flang] Hoist concurrent-limit and concurrent-step expressions outside the outer most do concurrent loop #111665

Are you sure you want to change the base?

[Flang] Hoist concurrent-limit and concurrent-step expressions outside the outer most do concurrent loop #111665

Conversation

harishch4 commented Oct 9, 2024 • edited by ergawy Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Oct 9, 2024

Uh oh!

clementval commented Oct 9, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

ergawy commented Oct 16, 2024

Uh oh!

vdonaldson Oct 16, 2024

Choose a reason for hiding this comment

Uh oh!

jeanPerier Oct 17, 2024

Choose a reason for hiding this comment

Uh oh!

vdonaldson Oct 16, 2024

Choose a reason for hiding this comment

Uh oh!

vdonaldson commented Oct 16, 2024

Uh oh!

jeanPerier left a comment

Choose a reason for hiding this comment

Uh oh!

jeanPerier Oct 17, 2024

Choose a reason for hiding this comment

Uh oh!

jeanPerier Oct 17, 2024

Choose a reason for hiding this comment

Uh oh!

ergawy commented Oct 29, 2024

Uh oh!

ergawy commented Oct 29, 2024

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

harishch4 commented Oct 9, 2024 •

edited by ergawy

Loading

clementval commented Oct 9, 2024 •

edited

Loading