
Conversation


@ergawy ergawy commented Sep 9, 2025

@llvmbot llvmbot added flang Flang issues not falling into any other category flang:fir-hlfir labels Sep 9, 2025

llvmbot commented Sep 9, 2025

@llvm/pr-subscribers-flang-fir-hlfir

Author: Kareem Ergawy (ergawy)

Changes

Extends support for mapping `do concurrent` to the device by adding support for `local` specifiers. The changes in this PR map the local variables to the `omp.target` op and use the mapped values as the `private` clause operands on the nested `omp.parallel` op.
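For illustration, a rough sketch of the IR shape this conversion aims to produce for a `do concurrent` loop with a `local` specifier (abridged from the local_device.mlir test added below; the SSA value names and the elided operands marked `...` are placeholders, not the exact compiler output):

    // The `local` specifier is rewritten as an omp.private privatizer.
    omp.private {type = private} @_QFfooEmy_local_private_f32.omp : f32

    // The host copy of the local variable is mapped onto the target region ...
    omp.target host_eval(...) map_entries(..., %local_map -> %local_map_arg : !fir.ref<f32>) {
      %local_dev_decl:2 = hlfir.declare %local_map_arg {uniq_name = "_QFfooEmy_local"} : (...) -> (...)
      omp.teams {
        // ... and the mapped value is then used as the private clause operand here.
        omp.parallel private(@_QFfooEmy_local_private_f32.omp %local_dev_decl#0 -> %local_priv_arg : !fir.ref<f32>) {
          omp.distribute {
            omp.wsloop {
              omp.loop_nest (...) {
                // The loop body uses the block argument introduced by the private clause.
                %local_loop_decl:2 = hlfir.declare %local_priv_arg {uniq_name = "_QFfooEmy_local"} : (...) -> (...)
                // ...
                omp.yield
              }
            }
          }
        }
      }
    }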


Full diff: https://github.com/llvm/llvm-project/pull/157638.diff

3 Files Affected:

  • (modified) flang/include/flang/Optimizer/Dialect/FIROps.td (+12)
  • (modified) flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp (+114-78)
  • (added) flang/test/Transforms/DoConcurrent/local_device.mlir (+49)
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index bc971e8fd6600..fc6eedc6ed4c6 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3894,6 +3894,18 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
       return getReduceVars().size();
     }
 
+    unsigned getInductionVarsStart() {
+      return 0;
+    }
+
+    unsigned getLocalOperandsStart() {
+      return getNumInductionVars();
+    }
+
+    unsigned getReduceOperandsStart() {
+      return getLocalOperandsStart() + getNumLocalOperands();
+    }
+
     mlir::Block::BlockArgListType getInductionVars() {
       return getBody()->getArguments().slice(0, getNumInductionVars());
     }
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 4358d9d1c9b86..3d2f5896ff4fb 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -138,6 +138,9 @@ void collectLoopLiveIns(fir::DoConcurrentLoopOp loop,
 
         liveIns.push_back(operand->get());
       });
+
+  for (mlir::Value local : loop.getLocalVars())
+    liveIns.push_back(local);
 }
 
 /// Collects values that are local to a loop: "loop-local values". A loop-local
@@ -252,8 +255,7 @@ class DoConcurrentConversion
               .getIsTargetDevice();
 
       mlir::omp::TargetOperands targetClauseOps;
-      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
-                           loopNestClauseOps,
+      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps,
                            isTargetDevice ? nullptr : &targetClauseOps);
 
       LiveInShapeInfoMap liveInShapeInfoMap;
@@ -275,14 +277,13 @@ class DoConcurrentConversion
     }
 
     mlir::omp::ParallelOp parallelOp =
-        genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
+        genParallelOp(rewriter, loop, ivInfos, mapper);
 
     // Only set as composite when part of `distribute parallel do`.
     parallelOp.setComposite(mapToDevice);
 
     if (!mapToDevice)
-      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
-                           loopNestClauseOps);
+      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps);
 
     for (mlir::Value local : locals)
       looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
@@ -291,10 +292,38 @@ class DoConcurrentConversion
     if (mapToDevice)
       genDistributeOp(doLoop.getLoc(), rewriter).setComposite(/*val=*/true);
 
-    mlir::omp::LoopNestOp ompLoopNest =
+    auto [loopNestOp, wsLoopOp] =
         genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
                     /*isComposite=*/mapToDevice);
 
+    // `local` region arguments are transferred/cloned from the `do concurrent`
+    // loop to the loopnest op when the region is cloned above. Instead, these
+    // region arguments should be on the workshare loop's region.
+    if (mapToDevice) {
+      for (auto [parallelArg, loopNestArg] : llvm::zip_equal(
+               parallelOp.getRegion().getArguments(),
+               loopNestOp.getRegion().getArguments().slice(
+                   loop.getLocalOperandsStart(), loop.getNumLocalOperands())))
+        rewriter.replaceAllUsesWith(loopNestArg, parallelArg);
+
+      for (auto [wsloopArg, loopNestArg] : llvm::zip_equal(
+               wsLoopOp.getRegion().getArguments(),
+               loopNestOp.getRegion().getArguments().slice(
+                   loop.getReduceOperandsStart(), loop.getNumReduceOperands())))
+        rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
+    } else {
+      for (auto [wsloopArg, loopNestArg] :
+           llvm::zip_equal(wsLoopOp.getRegion().getArguments(),
+                           loopNestOp.getRegion().getArguments().drop_front(
+                               loopNestClauseOps.loopLowerBounds.size())))
+        rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
+    }
+
+    for (unsigned i = 0;
+         i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
+      loopNestOp.getRegion().eraseArgument(
+          loopNestClauseOps.loopLowerBounds.size());
+
     rewriter.setInsertionPoint(doLoop);
     fir::FirOpBuilder builder(
         rewriter,
@@ -315,7 +344,7 @@ class DoConcurrentConversion
     // Mark `unordered` loops that are not perfectly nested to be skipped from
     // the legality check of the `ConversionTarget` since we are not interested
     // in mapping them to OpenMP.
-    ompLoopNest->walk([&](fir::DoConcurrentOp doLoop) {
+    loopNestOp->walk([&](fir::DoConcurrentOp doLoop) {
       concurrentLoopsToSkip.insert(doLoop);
     });
 
@@ -371,11 +400,21 @@ class DoConcurrentConversion
       llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>;
 
   mlir::omp::ParallelOp
-  genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+  genParallelOp(mlir::ConversionPatternRewriter &rewriter,
+                fir::DoConcurrentLoopOp loop,
                 looputils::InductionVariableInfos &ivInfos,
                 mlir::IRMapping &mapper) const {
-    auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc);
-    rewriter.createBlock(&parallelOp.getRegion());
+    mlir::omp::ParallelOperands parallelOps;
+
+    if (mapToDevice)
+      genPrivatizers(rewriter, mapper, loop, parallelOps);
+
+    mlir::Location loc = loop.getLoc();
+    auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc, parallelOps);
+    Fortran::common::openmp::EntryBlockArgs parallelArgs;
+    parallelArgs.priv.vars = parallelOps.privateVars;
+    Fortran::common::openmp::genEntryBlock(rewriter, parallelArgs,
+                                           parallelOp.getRegion());
     rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));
 
     genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
@@ -412,7 +451,7 @@ class DoConcurrentConversion
 
   void genLoopNestClauseOps(
       mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
-      fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
+      fir::DoConcurrentLoopOp loop,
       mlir::omp::LoopNestOperands &loopNestClauseOps,
       mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
     assert(loopNestClauseOps.loopLowerBounds.empty() &&
@@ -443,59 +482,14 @@ class DoConcurrentConversion
     loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
   }
 
-  mlir::omp::LoopNestOp
+  std::pair<mlir::omp::LoopNestOp, mlir::omp::WsloopOp>
   genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
               fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
               const mlir::omp::LoopNestOperands &clauseOps,
               bool isComposite) const {
     mlir::omp::WsloopOperands wsloopClauseOps;
-
-    auto cloneFIRRegionToOMP = [&rewriter](mlir::Region &firRegion,
-                                           mlir::Region &ompRegion) {
-      if (!firRegion.empty()) {
-        rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());
-        auto firYield =
-            mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator());
-        rewriter.setInsertionPoint(firYield);
-        mlir::omp::YieldOp::create(rewriter, firYield.getLoc(),
-                                   firYield.getOperands());
-        rewriter.eraseOp(firYield);
-      }
-    };
-
-    // For `local` (and `local_init`) opernads, emit corresponding `private`
-    // clauses and attach these clauses to the workshare loop.
-    if (!loop.getLocalVars().empty())
-      for (auto [op, sym, arg] : llvm::zip_equal(
-               loop.getLocalVars(),
-               loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
-               loop.getRegionLocalArgs())) {
-        auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
-            sym.getLeafReference());
-        if (localizer.getLocalitySpecifierType() ==
-            fir::LocalitySpecifierType::LocalInit)
-          TODO(localizer.getLoc(),
-               "local_init conversion is not supported yet");
-
-        mlir::OpBuilder::InsertionGuard guard(rewriter);
-        rewriter.setInsertionPointAfter(localizer);
-
-        auto privatizer = mlir::omp::PrivateClauseOp::create(
-            rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
-            localizer.getTypeAttr().getValue(),
-            mlir::omp::DataSharingClauseType::Private);
-
-        cloneFIRRegionToOMP(localizer.getInitRegion(),
-                            privatizer.getInitRegion());
-        cloneFIRRegionToOMP(localizer.getDeallocRegion(),
-                            privatizer.getDeallocRegion());
-
-        moduleSymbolTable.insert(privatizer);
-
-        wsloopClauseOps.privateVars.push_back(op);
-        wsloopClauseOps.privateSyms.push_back(
-            mlir::SymbolRefAttr::get(privatizer));
-      }
+    if (!mapToDevice)
+      genPrivatizers(rewriter, mapper, loop, wsloopClauseOps);
 
     if (!loop.getReduceVars().empty()) {
       for (auto [op, byRef, sym, arg] : llvm::zip_equal(
@@ -518,15 +512,15 @@ class DoConcurrentConversion
               rewriter, firReducer.getLoc(), ompReducerName,
               firReducer.getTypeAttr().getValue());
 
-          cloneFIRRegionToOMP(firReducer.getAllocRegion(),
+          cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
                               ompReducer.getAllocRegion());
-          cloneFIRRegionToOMP(firReducer.getInitializerRegion(),
+          cloneFIRRegionToOMP(rewriter, firReducer.getInitializerRegion(),
                               ompReducer.getInitializerRegion());
-          cloneFIRRegionToOMP(firReducer.getReductionRegion(),
+          cloneFIRRegionToOMP(rewriter, firReducer.getReductionRegion(),
                               ompReducer.getReductionRegion());
-          cloneFIRRegionToOMP(firReducer.getAtomicReductionRegion(),
+          cloneFIRRegionToOMP(rewriter, firReducer.getAtomicReductionRegion(),
                               ompReducer.getAtomicReductionRegion());
-          cloneFIRRegionToOMP(firReducer.getCleanupRegion(),
+          cloneFIRRegionToOMP(rewriter, firReducer.getCleanupRegion(),
                               ompReducer.getCleanupRegion());
           moduleSymbolTable.insert(ompReducer);
         }
@@ -558,21 +552,10 @@ class DoConcurrentConversion
 
     rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
     mlir::omp::YieldOp::create(rewriter, loop->getLoc());
+    loop->getParentOfType<mlir::ModuleOp>().print(
+        llvm::errs(), mlir::OpPrintingFlags().assumeVerified());
 
-    // `local` region arguments are transferred/cloned from the `do concurrent`
-    // loop to the loopnest op when the region is cloned above. Instead, these
-    // region arguments should be on the workshare loop's region.
-    for (auto [wsloopArg, loopNestArg] :
-         llvm::zip_equal(wsloopOp.getRegion().getArguments(),
-                         loopNestOp.getRegion().getArguments().drop_front(
-                             clauseOps.loopLowerBounds.size())))
-      rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
-
-    for (unsigned i = 0;
-         i < loop.getLocalVars().size() + loop.getReduceVars().size(); ++i)
-      loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size());
-
-    return loopNestOp;
+    return {loopNestOp, wsloopOp};
   }
 
   void genBoundsOps(fir::FirOpBuilder &builder, mlir::Value liveIn,
@@ -813,6 +796,59 @@ class DoConcurrentConversion
     return distOp;
   }
 
+  void cloneFIRRegionToOMP(mlir::ConversionPatternRewriter &rewriter,
+                           mlir::Region &firRegion,
+                           mlir::Region &ompRegion) const {
+    if (!firRegion.empty()) {
+      rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());
+      auto firYield =
+          mlir::cast<fir::YieldOp>(ompRegion.back().getTerminator());
+      rewriter.setInsertionPoint(firYield);
+      mlir::omp::YieldOp::create(rewriter, firYield.getLoc(),
+                                 firYield.getOperands());
+      rewriter.eraseOp(firYield);
+    }
+  }
+
+  void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
+                      mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
+                      mlir::omp::PrivateClauseOps &privateClauseOps) const {
+    // For `local` (and `local_init`) operands, emit corresponding `private`
+    // clauses and attach these clauses to the workshare loop.
+    if (!loop.getLocalVars().empty())
+      for (auto [var, sym, arg] : llvm::zip_equal(
+               loop.getLocalVars(),
+               loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
+               loop.getRegionLocalArgs())) {
+        auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
+            sym.getLeafReference());
+        if (localizer.getLocalitySpecifierType() ==
+            fir::LocalitySpecifierType::LocalInit)
+          TODO(localizer.getLoc(),
+               "local_init conversion is not supported yet");
+
+        mlir::OpBuilder::InsertionGuard guard(rewriter);
+        rewriter.setInsertionPointAfter(localizer);
+
+        auto privatizer = mlir::omp::PrivateClauseOp::create(
+            rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
+            localizer.getTypeAttr().getValue(),
+            mlir::omp::DataSharingClauseType::Private);
+
+        cloneFIRRegionToOMP(rewriter, localizer.getInitRegion(),
+                            privatizer.getInitRegion());
+        cloneFIRRegionToOMP(rewriter, localizer.getDeallocRegion(),
+                            privatizer.getDeallocRegion());
+
+        moduleSymbolTable.insert(privatizer);
+
+        privateClauseOps.privateVars.push_back(mapToDevice ? mapper.lookup(var)
+                                                           : var);
+        privateClauseOps.privateSyms.push_back(
+            mlir::SymbolRefAttr::get(privatizer));
+      }
+  }
+
   bool mapToDevice;
   llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
   mlir::SymbolTable &moduleSymbolTable;
diff --git a/flang/test/Transforms/DoConcurrent/local_device.mlir b/flang/test/Transforms/DoConcurrent/local_device.mlir
new file mode 100644
index 0000000000000..e54bb1aeb414e
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/local_device.mlir
@@ -0,0 +1,49 @@
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" %s -o - | FileCheck %s
+
+fir.local {type = local} @_QFfooEmy_local_private_f32 : f32
+
+func.func @_QPfoo() {
+  %0 = fir.dummy_scope : !fir.dscope
+  %3 = fir.alloca f32 {bindc_name = "my_local", uniq_name = "_QFfooEmy_local"}
+  %4:2 = hlfir.declare %3 {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+
+  fir.do_concurrent {
+    %7 = fir.alloca i32 {bindc_name = "i"}
+    %8:2 = hlfir.declare %7 {uniq_name = "_QFfooEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+    fir.do_concurrent.loop (%arg0) = (%c1) to (%c10) step (%c1) local(@_QFfooEmy_local_private_f32 %4#0 -> %arg1 : !fir.ref<f32>) {
+      %9 = fir.convert %arg0 : (index) -> i32
+      fir.store %9 to %8#0 : !fir.ref<i32>
+      %10:2 = hlfir.declare %arg1 {uniq_name = "_QFfooEmy_local"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+      %cst = arith.constant 4.200000e+01 : f32
+      hlfir.assign %cst to %10#0 : f32, !fir.ref<f32>
+    }
+  }
+  return
+}
+
+// CHECK: omp.private {type = private} @[[OMP_PRIVATIZER:.*.omp]] : f32
+
+// CHECK: %[[LOCAL_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}my_local"}
+// CHECK: %[[LOCAL_MAP:.*]] = omp.map.info var_ptr(%[[LOCAL_DECL]]#1 : {{.*}})
+
+// CHECK: omp.target host_eval({{.*}}) map_entries({{.*}}, %[[LOCAL_MAP]] -> %[[LOCAL_MAP_ARG:.*]] : {{.*}}) {
+// CHECK:   %[[LOCAL_DEV_DECL:.*]]:2 = hlfir.declare %[[LOCAL_MAP_ARG]] {uniq_name = "_QFfooEmy_local"}
+
+// CHECK:   omp.teams {
+// CHECK:     omp.parallel private(@[[OMP_PRIVATIZER]] %[[LOCAL_DEV_DECL]]#0 -> %[[LOCAL_PRIV_ARG:.*]] : {{.*}}) {
+// CHECK:       omp.distribute {
+// CHECK:         omp.wsloop {
+// CHECK:           omp.loop_nest {{.*}} {
+// CHECK:             %[[LOCAL_LOOP_DECL:.*]]:2 = hlfir.declare %[[LOCAL_PRIV_ARG]] {uniq_name = "_QFfooEmy_local"}
+// CHECK:             hlfir.assign %{{.*}} to %[[LOCAL_LOOP_DECL]]#0
+// CHECK:             omp.yield
+// CHECK:           }
+// CHECK:         }
+// CHECK:       }
+// CHECK:     }
+// CHECK:   }
+// CHECK: }

@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_5_offload_tests branch from 6c1a312 to 5d9fe15 Compare September 10, 2025 07:49
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_6_local_specs_device branch from 0c93791 to cbb2c67 Compare September 10, 2025 07:49
ergawy added a commit that referenced this pull request Sep 10, 2025
Upstreams further parts of `do concurrent` to OpenMP conversion pass
from AMD's fork. This PR extends the pass by adding support for mapping
to the device.

PR stack:
- #155754
- #155987 ◀️
- #155992
- #155993
- #157638
- #156610
- #156837
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_5_offload_tests branch from 5d9fe15 to e36db59 Compare September 10, 2025 18:51
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_6_local_specs_device branch from cbb2c67 to 723193b Compare September 10, 2025 18:51
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Sep 10, 2025
…#155987)

Upstreams further parts of `do concurrent` to OpenMP conversion pass
from AMD's fork. This PR extends the pass by adding support for mapping
to the device.

PR stack:
- llvm/llvm-project#155754
- llvm/llvm-project#155987 ◀️
- llvm/llvm-project#155992
- llvm/llvm-project#155993
- llvm/llvm-project#157638
- llvm/llvm-project#156610
- llvm/llvm-project#156837
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_5_offload_tests branch from e36db59 to 2177ccc Compare September 13, 2025 11:38
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_6_local_specs_device branch from 723193b to 983b97d Compare September 13, 2025 11:38
ergawy added a commit that referenced this pull request Sep 16, 2025
…5992)

Adds more lit tests for `do concurrent` device mapping.

PR stack:
- #155754
- #155987
- #155992 ◀️
- #155993
- #157638
- #156610
- #156837
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Sep 16, 2025
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_5_offload_tests branch from 2177ccc to fd66849 Compare September 16, 2025 09:50
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_6_local_specs_device branch from 983b97d to 5099595 Compare September 16, 2025 09:52
Base automatically changed from users/ergawy/upstream_dc_device_5_offload_tests to main September 17, 2025 05:04
ergawy added a commit that referenced this pull request Sep 17, 2025
…#155993)

Adds end-to-end tests for `do concurrent` offloading to the device.


PR stack:
- #155754
- #155987
- #155992
- #155993 ◀️
- #157638
- #156610
- #156837
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_6_local_specs_device branch from 5099595 to d67632e Compare September 17, 2025 05:05
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Sep 17, 2025
Contributor

@bhandarkar-pranav bhandarkar-pranav left a comment


Thanks for the PR @ergawy. I have one minor nit which you can roll in with any changes that other reviewers might require.

}
}

void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
Contributor

Can you please add an overarching comment about what the function does, including any conditions/prerequisites for calling it?

Member Author

Done. Let me know if it could be expanded more.

Contributor

Thank you, looks good. Just one (ultra) nit: is "FIR privatizer" the right term, or should it be "FIR localizer"?

Extends support for mapping `do concurrent` on the device by adding
support for `local` specifiers. The changes in this PR map the local
variable to the `omp.target` op and uses the mapped value as the
`private` clause operand in the nested `omp.parallel` op.
@ergawy ergawy force-pushed the users/ergawy/upstream_dc_device_6_local_specs_device branch from d67632e to c59436f Compare September 21, 2025 06:31
Contributor

@mjklemm mjklemm left a comment


LGTM

@ergawy ergawy merged commit 9008c44 into main Sep 23, 2025
10 checks passed
@ergawy ergawy deleted the users/ergawy/upstream_dc_device_6_local_specs_device branch September 23, 2025 05:27
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Sep 23, 2025
… (#157638)

Extends support for mapping `do concurrent` on the device by adding
support for `local` specifiers. The changes in this PR map the local
variable to the `omp.target` op and uses the mapped value as the
`private` clause operand in the nested `omp.parallel` op.

- llvm/llvm-project#155754
- llvm/llvm-project#155987
- llvm/llvm-project#155992
- llvm/llvm-project#155993
- llvm/llvm-project#157638 ◀️
- llvm/llvm-project#156610
- llvm/llvm-project#156837
ergawy added a commit that referenced this pull request Sep 23, 2025
Extends `do concurrent` to OpenMP device mapping by adding support for
mapping `reduce` specifiers to omp `reduction` clauses. The changes
attach 2 `reduction` clauses to the mapped OpenMP construct: one on the
`teams` part of the construct and one on the `wloop` part.

- #155754
- #155987
- #155992
- #155993
- #157638
- #156610 ◀️
- #156837
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Sep 23, 2025
…e (#156610)

Extends `do concurrent` to OpenMP device mapping by adding support for
mapping `reduce` specifiers to omp `reduction` clauses. The changes
attach 2 `reduction` clauses to the mapped OpenMP construct: one on the
`teams` part of the construct and one on the `wloop` part.

- llvm/llvm-project#155754
- llvm/llvm-project#155987
- llvm/llvm-project#155992
- llvm/llvm-project#155993
- llvm/llvm-project#157638
- llvm/llvm-project#156610 ◀️
- llvm/llvm-project#156837
ergawy added a commit that referenced this pull request Sep 23, 2025
… GPU (#156837)

Fixes a bug related to insertion points when inlining multi-block
combiner reduction regions. The IP at the end of the inlined region was
not used resulting in emitting BBs with multiple terminators.

PR stack:
- #155754
- #155987
- #155992
- #155993
- #157638
- #156610
- #156837 ◀️
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Sep 23, 2025
…ions on the GPU (#156837)

Fixes a bug related to insertion points when inlining multi-block
combiner reduction regions. The IP at the end of the inlined region was
not used resulting in emitting BBs with multiple terminators.

PR stack:
- llvm/llvm-project#155754
- llvm/llvm-project#155987
- llvm/llvm-project#155992
- llvm/llvm-project#155993
- llvm/llvm-project#157638
- llvm/llvm-project#156610
- llvm/llvm-project#156837 ◀️