diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td
index e2f092024c250..6b5e38aa4faed 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.td
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -128,4 +128,21 @@ def AutomapToTargetDataPass
   let dependentDialects = ["mlir::omp::OpenMPDialect"];
 }
 
+def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::func::FuncOp"> {
+  let summary = "Replaces stack allocations with shared memory.";
+  let description = [{
+    `fir.alloca` operations defining values in a generic target region and then
+    used inside of an `omp.parallel` region are replaced by this pass with
+    `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is also done for
+    top-level function `fir.alloca`s used in the same way when the parent
+    function is a target device function.
+
+    This ensures that explicit private allocations, intended to be shared across
+    threads, use the proper memory space on a target device while supporting the
+    case of parallel regions indirectly reached from within a target region via
+    function calls.
+  }];
+  let dependentDialects = ["mlir::omp::OpenMPDialect"]; // Creates omp.* ops.
+}
+
 #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index b85ee7e861a4f..1c224e0785f96 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -11,6 +11,7 @@ add_flang_library(FlangOpenMPTransforms
   LowerWorkshare.cpp
   LowerNontemporal.cpp
   SimdOnly.cpp
+  StackToShared.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/OpenMP/StackToShared.cpp b/flang/lib/Optimizer/OpenMP/StackToShared.cpp
new file mode 100644
index 0000000000000..e666e2ed8f9b9
--- /dev/null
+++ b/flang/lib/Optimizer/OpenMP/StackToShared.cpp
@@ -0,0 +1,162 @@
+//===- StackToShared.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements transforms to swap stack allocations on the target
+// device with device shared memory where applicable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h"
+
+namespace flangomp {
+#define GEN_PASS_DEF_STACKTOSHAREDPASS
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+using namespace mlir;
+
+namespace {
+class StackToSharedPass
+    : public flangomp::impl::StackToSharedPassBase<StackToSharedPass> {
+public:
+  StackToSharedPass() = default;
+
+  // Replace eligible fir.alloca ops with device shared memory allocations.
+  // This is a no-op unless the module is compiled for the target device.
+  void runOnOperation() override {
+    func::FuncOp funcOp = getOperation();
+    auto offloadIface = funcOp->getParentOfType<omp::OffloadModuleInterface>();
+    if (!offloadIface || !offloadIface.getIsTargetDevice())
+      return;
+
+    // Blocks are never created or erased below: compute dominance only once.
+    DominanceInfo domInfo;
+    OpBuilder builder(&getContext());
+    funcOp->walk([&](fir::AllocaOp allocaOp) {
+      if (!shouldReplaceAlloca(*allocaOp))
+        return;
+
+      // Replace fir.alloca with omp.alloc_shared_mem.
+      builder.setInsertionPoint(allocaOp);
+      auto sharedAllocOp = omp::AllocSharedMemOp::create(
+          builder, allocaOp->getLoc(), allocaOp.getResult().getType(),
+          allocaOp.getInType(), allocaOp.getUniqNameAttr(),
+          allocaOp.getBindcNameAttr(), allocaOp.getTypeparams(),
+          allocaOp.getShape());
+      allocaOp.replaceAllUsesWith(sharedAllocOp.getOperation());
+      allocaOp.erase();
+
+      // Free the buffer at every region exit dominated by the allocation.
+      Block *allocaBlock = sharedAllocOp->getBlock();
+      for (Block &block : sharedAllocOp->getParentRegion()->getBlocks()) {
+        Operation *terminator = block.getTerminator();
+        if (!terminator->hasSuccessors() &&
+            domInfo.dominates(allocaBlock, &block)) {
+          builder.setInsertionPoint(terminator);
+          omp::FreeSharedMemOp::create(builder, sharedAllocOp.getLoc(),
+                                       sharedAllocOp);
+        }
+      }
+    });
+  }
+
+private:
+  // TODO: Refactor the logic in `shouldReplaceAlloca` and `checkAllocaUses` to
+  // be reusable by the MLIR to LLVM IR translation stage, as something very
+  // similar is also implemented there to choose between allocas and device
+  // shared memory allocations when processing OpenMP reductions, mapping and
+  // privatization.
+
+  // Decide whether to replace a fir.alloca with a device shared memory
+  // allocation/deallocation pair based on the location of the allocation and
+  // its uses.
+  //
+  // In summary, it should be done whenever the allocation is placed outside any
+  // device parallel regions and inside either a target device function or a
+  // generic kernel, while being used inside of a parallel region.
+  bool shouldReplaceAlloca(Operation &op) {
+    auto targetOp = op.getParentOfType<omp::TargetOp>();
+
+    // It must be inside of a generic omp.target or in a target device function,
+    // and not inside of omp.parallel (unless it wraps the whole omp.target).
+    if (auto parallelOp = op.getParentOfType<omp::ParallelOp>()) {
+      if (!targetOp || !parallelOp->isProperAncestor(targetOp))
+        return false;
+    }
+
+    if (targetOp) {
+      if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) !=
+          mlir::omp::TargetExecMode::generic)
+        return false;
+    } else {
+      auto declTargetIface = dyn_cast<mlir::omp::DeclareTargetInterface>(
+          *op.getParentOfType<func::FuncOp>());
+      if (!declTargetIface || !declTargetIface.isDeclareTarget() ||
+          declTargetIface.getDeclareTargetDeviceType() ==
+              mlir::omp::DeclareTargetDeviceType::host)
+        return false;
+    }
+
+    return checkAllocaUses(op.getUses());
+  }
+
+  // When a use takes place inside an omp.parallel region and it's not as a
+  // private clause argument, or when it is a reduction argument passed to
+  // omp.parallel, then the defining allocation is eligible for replacement with
+  // shared memory. Host parallel regions wrapping the omp.target are ignored.
+  //
+  // Only one of the uses needs to meet these conditions to return true.
+  bool checkAllocaUses(const Operation::use_range &uses) {
+    auto checkUse = [&](const OpOperand &use) {
+      Operation *owner = use.getOwner();
+      if (auto parallelOp = dyn_cast<omp::ParallelOp>(owner))
+        return llvm::is_contained(parallelOp.getReductionVars(), use.get());
+
+      // Only parallel regions nested in the enclosing omp.target (if any) are
+      // relevant: one wrapping it from the outside belongs to the host.
+      auto parallelOp = owner->getParentOfType<omp::ParallelOp>();
+      auto targetOp = owner->getParentOfType<omp::TargetOp>();
+      if (!parallelOp || (targetOp && !targetOp->isProperAncestor(parallelOp)))
+        return false;
+      // If it is used directly inside of a parallel region, it has to be
+      // replaced unless the use is a private clause.
+      auto argIface = dyn_cast<omp::BlockArgOpenMPOpInterface>(owner);
+      auto syms = llvm::cast_or_null<ArrayAttr>(owner->getAttr("private_syms"));
+      if (!argIface || !syms)
+        return true;
+      auto moduleOp = owner->getParentOfType<ModuleOp>();
+      for (auto [var, sym] : llvm::zip_equal(argIface.getPrivateVars(), syms)) {
+        if (var == use.get()) {
+          auto privateOp = cast<omp::PrivateClauseOp>(
+              moduleOp.lookupSymbol(cast<SymbolRefAttr>(sym)));
+          return privateOp.getDataSharingType() !=
+                 omp::DataSharingClauseType::Private;
+        }
+      }
+      return true;
+    };
+
+    // Check direct uses and also follow hlfir.declare uses.
+    for (const OpOperand &use : uses) {
+      if (auto declareOp = dyn_cast<hlfir::DeclareOp>(use.getOwner())) {
+        if (checkAllocaUses(declareOp->getUses()))
+          return true;
+      } else if (checkUse(use)) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index a83b0665eaf1f..3bcf71c8d3eda 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -335,8 +335,10 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm,
   pm.addPass(flangomp::createMapInfoFinalizationPass());
   pm.addPass(flangomp::createMarkDeclareTargetPass());
   pm.addPass(flangomp::createGenericLoopConversionPass());
-  if (opts.isTargetDevice)
+  if (opts.isTargetDevice) {
+    pm.addPass(flangomp::createStackToSharedPass());
     pm.addPass(flangomp::createFunctionFilteringPass());
+  }
 }
 
 void createDebugPasses(mlir::PassManager &pm,
diff --git a/flang/test/Transforms/OpenMP/stack-to-shared.mlir b/flang/test/Transforms/OpenMP/stack-to-shared.mlir
new file mode 100644
index 0000000000000..a7842048a8411
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/stack-to-shared.mlir
@@ -0,0 +1,215 @@
+// RUN: fir-opt --split-input-file --omp-stack-to-shared %s | FileCheck %s
+
+module attributes {omp.is_target_device = true} { // device compilation
+  omp.declare_reduction @add_reduction_i32 : i32 init {
+  ^bb0(%arg0: i32):
+    %c0_i32 = arith.constant 0 : i32
+    omp.yield(%c0_i32 : i32)
+  } combiner {
+  ^bb0(%arg0: i32, %arg1: i32):
+    %0 = arith.addi %arg0, %arg1 : i32
+    omp.yield(%0 : i32)
+  }
+
+  omp.private {type = private} @privatizer_i32 : i32
+  omp.private {type = firstprivate} @firstprivatizer_i32 : i32 copy {
+  ^bb0(%arg0: i32, %arg1: i32):
+    omp.yield(%arg0 : i32)
+  }
+
+  // Verify that target device functions are searched for allocas shared across
+  // threads of a parallel region.
+  //
+  // Also ensure that all fir.alloca information is adequately forwarded to the
+  // new allocation, that uses of the allocation through hlfir.declare are
+  // detected and that only allocations with the expected kinds of uses (parallel
+  // reduction and non-private uses inside of a parallel region) are replaced.
+  // CHECK-LABEL: func.func @standalone_func
+  func.func @standalone_func(%lb: i32, %ub: i32, %step: i32) attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>} {
+    // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref<i32>
+    %0 = fir.alloca i32 {uniq_name = "x"} // reduction variable of omp.parallel
+    %c = arith.constant 1 : index
+    // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem !fir.char<1,?>(%[[C:.*]] : index), %[[C]] {bindc_name = "y", uniq_name = "y"} : !fir.ref<!fir.char<1,?>>
+    %1 = fir.alloca !fir.char<1,?>(%c : index), %c {bindc_name = "y", uniq_name = "y"} // used in omp.parallel through hlfir.declare
+    // CHECK: %{{.*}}:2 = hlfir.declare %[[ALLOC_1]] typeparams %[[C]] {uniq_name = "y"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+    %decl:2 = hlfir.declare %1 typeparams %c {uniq_name = "y"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+    // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "z"}
+    %2 = fir.alloca i32 {uniq_name = "z"} // only used by a private clause
+    // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "a"} : !fir.ref<i32>
+    %3 = fir.alloca i32 {uniq_name = "a"} // used by a firstprivate clause
+    // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "b"}
+    %4 = fir.alloca i32 {uniq_name = "b"} // only used outside of omp.parallel
+    omp.parallel reduction(@add_reduction_i32 %0 -> %arg0 : !fir.ref<i32>) {
+      // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "c"}
+      %5 = fir.alloca i32 {uniq_name = "c"} // allocated inside of omp.parallel
+      %6:2 = fir.unboxchar %decl#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+      omp.wsloop private(@privatizer_i32 %2 -> %arg1, @firstprivatizer_i32 %3 -> %arg2 : !fir.ref<i32>, !fir.ref<i32>) {
+        omp.loop_nest (%arg3) : i32 = (%lb) to (%ub) inclusive step (%step) {
+          %7 = fir.load %5 : !fir.ref<i32>
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    %5 = fir.load %4 : !fir.ref<i32>
+    // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref<i32>
+    // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref<!fir.char<1,?>>
+    // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref<i32>
+    // CHECK-NEXT: return
+    return
+  }
+
+  // Verify that generic target regions are searched for allocas shared across
+  // threads of a parallel region.
+  // CHECK-LABEL: func.func @target_generic
+  func.func @target_generic() {
+    // CHECK: omp.target
+    omp.target {
+      %c = arith.constant 0 : i32
+      // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref<i32>
+      %0 = fir.alloca i32 {uniq_name = "x"}
+      // CHECK: omp.teams
+      omp.teams {
+        // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem i32 {uniq_name = "y"} : !fir.ref<i32>
+        %1 = fir.alloca i32 {uniq_name = "y"}
+        // CHECK: omp.distribute
+        omp.distribute {
+          omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) {
+            // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "z"} : !fir.ref<i32>
+            %2 = fir.alloca i32 {uniq_name = "z"}
+            // CHECK: omp.parallel
+            omp.parallel {
+              %3 = fir.load %0 : !fir.ref<i32>
+              %4 = fir.load %1 : !fir.ref<i32>
+              %5 = fir.load %2 : !fir.ref<i32>
+              // CHECK: omp.terminator
+              omp.terminator
+            }
+            // CHECK: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref<i32>
+            // CHECK: omp.yield
+            omp.yield
+          }
+        }
+        // CHECK: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref<i32>
+        // CHECK: omp.terminator
+        omp.terminator
+      }
+      // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref<i32>
+      // CHECK: omp.terminator
+      omp.terminator
+    }
+    // CHECK: return
+    return
+  }
+
+  // Make sure that allocations without uses inside of a parallel region nested
+  // in the target region are left untouched, even if there is another parallel
+  // region in the host wrapping the whole target region.
+  // CHECK-LABEL: func.func @target_generic_in_parallel
+  func.func @target_generic_in_parallel() {
+    // CHECK-NOT: omp.alloc_shared_mem
+    // CHECK-NOT: omp.free_shared_mem
+    omp.parallel {
+      omp.target {
+        %c = arith.constant 0 : i32
+        %0 = fir.alloca i32 {uniq_name = "x"}
+        omp.teams {
+          %1 = fir.alloca i32 {uniq_name = "y"}
+          omp.distribute {
+            omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) {
+              %3 = fir.load %0 : !fir.ref<i32>
+              %4 = fir.load %1 : !fir.ref<i32>
+              omp.parallel { // nested parallel region without any uses
+                omp.terminator
+              }
+              omp.yield
+            }
+          }
+          omp.terminator
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    // CHECK: return
+    return
+  }
+
+  // Ensure that allocations within SPMD target regions are not replaced with
+  // device shared memory regardless of use.
+  // CHECK-LABEL: func.func @target_spmd
+  func.func @target_spmd() {
+    // CHECK-NOT: omp.alloc_shared_mem
+    // CHECK-NOT: omp.free_shared_mem
+    omp.target {
+      %c = arith.constant 0 : i32
+      %0 = fir.alloca i32 {uniq_name = "x"}
+      omp.teams {
+        %1 = fir.alloca i32 {uniq_name = "y"}
+        omp.parallel {
+          %2 = fir.alloca i32 {uniq_name = "z"}
+          %3 = fir.load %0 : !fir.ref<i32>
+          %4 = fir.load %1 : !fir.ref<i32>
+          omp.distribute {
+            omp.wsloop {
+              omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) {
+                %5 = fir.load %2 : !fir.ref<i32>
+                omp.yield
+              }
+            } {omp.composite}
+          } {omp.composite}
+          omp.terminator
+        } {omp.composite}
+        omp.terminator
+      }
+      omp.terminator
+    }
+    // CHECK: return
+    return
+  }
+}
+
+// -----
+
+// No transformations must be done for the host (omp.is_target_device is unset).
+// CHECK-LABEL: func.func @host_standalone
+func.func @host_standalone() {
+  // CHECK-NOT: omp.alloc_shared_mem
+  // CHECK-NOT: omp.free_shared_mem
+  %0 = fir.alloca i32 {uniq_name = "x"}
+  omp.parallel {
+    %1 = fir.load %0 : !fir.ref<i32>
+    omp.terminator
+  }
+  // CHECK: return
+  return
+}
+
+// CHECK-LABEL: func.func @host_target
+func.func @host_target() {
+  // CHECK-NOT: omp.alloc_shared_mem
+  // CHECK-NOT: omp.free_shared_mem
+  omp.target { // same region structure as @target_generic
+    %c = arith.constant 0 : i32
+    %0 = fir.alloca i32 {uniq_name = "x"}
+    omp.teams {
+      %1 = fir.alloca i32 {uniq_name = "y"}
+      omp.distribute {
+        omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) {
+          %2 = fir.alloca i32 {uniq_name = "z"}
+          omp.parallel {
+            %3 = fir.load %0 : !fir.ref<i32>
+            %4 = fir.load %1 : !fir.ref<i32>
+            %5 = fir.load %2 : !fir.ref<i32>
+            omp.terminator
+          }
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  // CHECK: return
+  return
+}