
Commit f3d74ae

[flang][OpenMP] Fix crash when a sliced array is specified in forall construct within workshare construct
This fixes two problems:
1. Thread-local variables sometimes are required to be parallelized. Added a special case to handle this in LowerWorkshare.cpp:isSafeToParallelize.
2. A race condition caused by a `nowait` added to the omp.workshare if it is the last operation in a block. This allowed multiple threads to execute the omp.workshare region concurrently. Since _FortranAPushValue modifies a shared stack, this concurrent access causes a crash. Disable the addition of `nowait` and rely on the implicit barrier at the end of the omp.workshare region.

Fixes #143330
1 parent 254b33f commit f3d74ae
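
To make the second problem concrete, here is a minimal C++/OpenMP analogue of the race; this is an illustrative sketch only, not flang output and not the reproducer from #143330. With `nowait` on a single construct inside a loop, one thread can still be inside iteration i's single block while another thread has already entered iteration i+1's, so both mutate the shared container at once, which mirrors how calls into the shared runtime stack (_FortranAPushValue) could race once `nowait` was emitted inside the fir.do_loop generated for the forall.

// Illustrative analogue only (assumed names, not flang code): sharedStack
// stands in for the Fortran runtime's shared temporary stack.
#include <vector>

static std::vector<int> sharedStack;

void racyPush(int n) {
#pragma omp parallel
  for (int i = 0; i < n; ++i) {
    // `nowait`: no barrier after the single region, so threads run ahead to
    // later iterations and push_back calls from different iterations overlap.
#pragma omp single nowait
    sharedStack.push_back(i); // data race, may corrupt the container
  }
}

void safePush(int n) {
#pragma omp parallel
  for (int i = 0; i < n; ++i) {
    // Without `nowait` the implicit barrier serializes the pushes, which is
    // the behaviour the commit restores for workshare inside a fir.do_loop.
#pragma omp single
    sharedStack.push_back(i);
  }
}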


flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp

Lines changed: 47 additions & 2 deletions
@@ -136,8 +136,48 @@ static bool mustParallelizeOp(Operation *op) {
 }
 
 static bool isSafeToParallelize(Operation *op) {
-  return isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
-         isMemoryEffectFree(op);
+  if (isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
+      isMemoryEffectFree(op))
+    return true;
+
+  // Special case: Thread-local variables allocated in the OpenMP parallel
+  // region (using fir.alloca) are private to each thread and thus safe (and
+  // sometimes required) to parallelize. If the compiler wraps such load/stores
+  // in an omp.workshare block, only one thread updates its local copy, while
+  // all other threads read uninitialized data (see issue #143330).
+  if (fir::StoreOp store = dyn_cast<fir::StoreOp>(op)) {
+    Value mem = store.getMemref();
+    while (fir::DeclareOp declare = mem.getDefiningOp<fir::DeclareOp>())
+      mem = declare.getMemref();
+    while (hlfir::DeclareOp declare = mem.getDefiningOp<hlfir::DeclareOp>())
+      mem = declare.getMemref();
+
+    if (fir::AllocaOp alloca = mem.getDefiningOp<fir::AllocaOp>()) {
+      if (mlir::omp::ParallelOp parallelOp =
+              alloca->getParentOfType<mlir::omp::ParallelOp>()) {
+        if (op->getParentOfType<mlir::omp::ParallelOp>() == parallelOp)
+          return true;
+      }
+    }
+  }
+
+  if (fir::LoadOp load = dyn_cast<fir::LoadOp>(op)) {
+    Value mem = load.getMemref();
+    while (fir::DeclareOp declare = mem.getDefiningOp<fir::DeclareOp>())
+      mem = declare.getMemref();
+    while (hlfir::DeclareOp declare = mem.getDefiningOp<hlfir::DeclareOp>())
+      mem = declare.getMemref();
+
+    if (fir::AllocaOp alloca = mem.getDefiningOp<fir::AllocaOp>()) {
+      if (mlir::omp::ParallelOp parallelOp =
+              alloca->getParentOfType<mlir::omp::ParallelOp>()) {
+        if (op->getParentOfType<mlir::omp::ParallelOp>() == parallelOp)
+          return true;
+      }
+    }
+  }
+
+  return false;
 }
 
 /// Simple shallow copies suffice for our purposes in this pass, so we implement
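
The store and load branches in the new code are identical apart from how the memref is obtained. As a hedged refactoring sketch, not part of the commit, the shared walk could be hoisted into one helper; `isThreadLocalAlloca` is a hypothetical name, and the snippet assumes the headers and using-declarations already present in LowerWorkshare.cpp:

// Hypothetical helper: walk through fir.declare / hlfir.declare wrappers and
// report whether the underlying storage is a fir.alloca created in the same
// omp.parallel region that encloses the accessing operation.
static bool isThreadLocalAlloca(mlir::Operation *op, mlir::Value mem) {
  while (auto declare = mem.getDefiningOp<fir::DeclareOp>())
    mem = declare.getMemref();
  while (auto declare = mem.getDefiningOp<hlfir::DeclareOp>())
    mem = declare.getMemref();

  if (auto alloca = mem.getDefiningOp<fir::AllocaOp>())
    if (auto parallelOp = alloca->getParentOfType<mlir::omp::ParallelOp>())
      return op->getParentOfType<mlir::omp::ParallelOp>() == parallelOp;
  return false;
}

// The two branches in isSafeToParallelize could then collapse to:
//   if (auto store = dyn_cast<fir::StoreOp>(op))
//     return isThreadLocalAlloca(op, store.getMemref());
//   if (auto load = dyn_cast<fir::LoadOp>(op))
//     return isThreadLocalAlloca(op, load.getMemref());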
@@ -335,6 +375,11 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
 
   for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
     bool isLast = i + 1 == regions.size();
+    // Make sure shared runtime calls are synchronized: disable `nowait`
+    // insertion, and rely on the implicit barrier at the end of the
+    // omp.workshare block.
+    if (isa<fir::DoLoopOp>(block.getParentOp()))
+      isLast = false;
     if (std::holds_alternative<SingleRegion>(opOrSingle)) {
       OpBuilder singleBuilder(sourceRegion.getContext());
       Block *singleBlock = new Block();
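
Equivalently, the new check could be phrased as a small predicate so the fir.do_loop special case reads as a policy decision. This is a hedged sketch with a hypothetical name (`allowNowaitOnLastSingle`), not code from the commit, and it assumes the same surroundings as the loop above:

// Hypothetical predicate: the last single region in a block may normally drop
// its trailing barrier via `nowait`, but not when the block sits inside a
// fir.do_loop, where each iteration's single regions can call into shared
// runtime state (e.g. _FortranAPushValue) and must stay separated by the
// implicit barrier.
static bool allowNowaitOnLastSingle(mlir::Block &block, bool isLastRegion) {
  if (mlir::isa<fir::DoLoopOp>(block.getParentOp()))
    return false;
  return isLastRegion;
}

// Usage inside the loop over regions (sketch):
//   bool isLast = allowNowaitOnLastSingle(block, i + 1 == regions.size());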
