Skip the named barrier in single-threaded generic-mode parallel regions.

__kmpc_barrier now checks for SPMD mode first; otherwise it returns after
__kmpc_flush when omp_get_num_threads() == 1, replacing the previous
mapping::isMainThreadInGenericMode() early exit. Two offloading tests are added.

diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 0854c21ee152a..2f1ed34a3f6d6 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -303,12 +303,14 @@ int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
 }
 
 void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
-  if (mapping::isMainThreadInGenericMode())
-    return __kmpc_flush(Loc);
-
   if (mapping::isSPMDMode())
     return __kmpc_barrier_simple_spmd(Loc, TId);
 
+  // Generic parallel regions are run with multiple of the warp size or single
+  // threaded, in the latter case we need to stop here.
+  if (omp_get_num_threads() == 1)
+    return __kmpc_flush(Loc);
+
   impl::namedBarrier();
 }
 
diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_1.c b/offload/test/offloading/single_threaded_for_barrier_hang_1.c
new file mode 100644
index 0000000000000..8ee6b51fb6818
--- /dev/null
+++ b/offload/test/offloading/single_threaded_for_barrier_hang_1.c
@@ -0,0 +1,21 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// RUN: %libomptarget-compileopt-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int b = 0;
+
+#pragma omp target map(tofrom : b)
+  for (int i = 1; i <= 10; ++i) {
+#pragma omp parallel num_threads(10) reduction(+ : b)
+#pragma omp for
+    for (int k = 0; k < 10; ++k)
+      ++b;
+  }
+
+  // CHECK: b: 100
+  printf("b: %i\n", b);
+  return 0;
+}
diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_2.c b/offload/test/offloading/single_threaded_for_barrier_hang_2.c
new file mode 100644
index 0000000000000..a98abd6922da7
--- /dev/null
+++ b/offload/test/offloading/single_threaded_for_barrier_hang_2.c
@@ -0,0 +1,23 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// FIXME: This fails with optimization enabled and prints b: 0
+// FIXME: RUN: %libomptarget-compileopt-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int b = 0;
+
+#pragma omp target map(tofrom : b) thread_limit(256)
+  for (int i = 1; i <= 1; ++i) {
+#pragma omp parallel num_threads(64) reduction(+ : b)
+#pragma omp parallel num_threads(10) reduction(+ : b)
+#pragma omp for
+    for (int k = 0; k < 10; ++k)
+      ++b;
+  }
+
+  // CHECK: b: 640
+  printf("b: %i\n", b);
+  return 0;
+}