diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 0854c21ee152a..2f1ed34a3f6d6 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -303,12 +303,14 @@ int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
 }
 
 void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
-  if (mapping::isMainThreadInGenericMode())
-    return __kmpc_flush(Loc);
-
   if (mapping::isSPMDMode())
     return __kmpc_barrier_simple_spmd(Loc, TId);
 
+  // Generic-mode parallel regions run with either a multiple of the warp size
+  // or a single thread; with one thread, just flush and skip the named barrier.
+  if (omp_get_num_threads() == 1)
+    return __kmpc_flush(Loc);
+
   impl::namedBarrier();
 }
 
diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_1.c b/offload/test/offloading/single_threaded_for_barrier_hang_1.c
new file mode 100644
index 0000000000000..8ee6b51fb6818
--- /dev/null
+++ b/offload/test/offloading/single_threaded_for_barrier_hang_1.c
@@ -0,0 +1,21 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// RUN: %libomptarget-compileopt-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int b = 0; // Mapped tofrom the device; accumulates via the reduction.
+
+#pragma omp target map(tofrom : b)
+  for (int i = 1; i <= 10; ++i) { // Launches 10 parallel regions in sequence.
+#pragma omp parallel num_threads(10) reduction(+ : b)
+#pragma omp for
+    for (int k = 0; k < 10; ++k)
+      ++b; // 10 regions x 10 iterations each gives the expected 100.
+  }
+
+  // CHECK: b: 100
+  printf("b: %i\n", b);
+  return 0;
+}
diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_2.c b/offload/test/offloading/single_threaded_for_barrier_hang_2.c
new file mode 100644
index 0000000000000..a98abd6922da7
--- /dev/null
+++ b/offload/test/offloading/single_threaded_for_barrier_hang_2.c
@@ -0,0 +1,23 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// FIXME: This fails with optimization enabled and prints b: 0
+// FIXME: RUN: %libomptarget-compileopt-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int b = 0; // Mapped tofrom the device; accumulates via the reductions.
+
+#pragma omp target map(tofrom : b) thread_limit(256)
+  for (int i = 1; i <= 1; ++i) { // Single trip around the nested regions.
+#pragma omp parallel num_threads(64) reduction(+ : b)
+#pragma omp parallel num_threads(10) reduction(+ : b)
+#pragma omp for
+    for (int k = 0; k < 10; ++k)
+      ++b; // Expected 640: presumably 64 outer threads x 10 iterations each.
+  }
+
+  // CHECK: b: 640
+  printf("b: %i\n", b);
+  return 0;
+}