Skip to content

Commit 8cfb8b6

Browse files
WeiqunZhang and atmyers authored
Add ParallelForOMP (#4595)
ParallelFor does not do any OMP threading, because we do coarse-grained OMP threading at the MFIter level with tiling. But there are cases we use ParallelFor outside MFIter and they could benefit from OMP threading. Thus, we add new ParallelForOMP functions in this PR. Note that OMP threading is used only for CPU builds with OMP, otherwise the new ParallelForOMP functions are equivalent to ParallelFor functions. --------- Co-authored-by: Andrew Myers <[email protected]>
1 parent 9d63d59 commit 8cfb8b6

File tree

8 files changed

+269
-160
lines changed

8 files changed

+269
-160
lines changed

Src/Base/AMReX_GpuLaunch.H

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,4 +241,113 @@ namespace Gpu {
241241

242242
#include <AMReX_CTOParallelForImpl.H>
243243

244+
namespace amrex {

// ParallelForOMP: same call signatures as amrex::ParallelFor, but with OpenMP
// threading added for CPU builds compiled with OMP.  Per the commit message,
// ParallelFor itself does no OMP threading because AMReX normally does
// coarse-grained OMP threading at the MFIter level with tiling; these
// overloads are for ParallelFor-style loops used outside an MFIter region.
// For GPU builds, or CPU builds without OMP, they are exact pass-throughs to
// ParallelFor.

#if defined(AMREX_USE_GPU) || !defined(AMREX_USE_OMP)

// Pass-through: 1-D loop over i in [0, n).  M is SFINAE, restricting T to
// integral types so this overload does not capture Box calls.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (T n, L const& f) noexcept
{
ParallelFor(n, f);
}

// Pass-through: loop over the cells of box, calling f(i,j,k).
template <typename L>
void ParallelForOMP (Box const& box, L const& f) noexcept
{
ParallelFor(box, f);
}

// Pass-through: loop over the cells of box and ncomp components,
// calling f(i,j,k,n).
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
ParallelFor(box, ncomp, f);
}

#else /* !defined(AMREX_USE_GPU) && defined(AMREX_USE_OMP) */

// CPU+OMP: 1-D loop, OMP-parallel over i.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
AMREX_ATTRIBUTE_FLATTEN_FOR
void ParallelForOMP (T n, L const& f) noexcept
{
#pragma omp parallel for
for (T i = 0; i < n; ++i) {
f(i);
}
}

// CPU+OMP: loop over the cells of box, calling f(i,j,k).
// Unused index arguments are passed as 0 below AMREX_SPACEDIM.
// Parallelization strategy by dimensionality:
//   1-D: OMP over i;  2-D: OMP over j, i innermost (SIMD hint);
//   3-D: OMP collapse(2) over (k,j), i innermost (SIMD hint).
template <typename L>
AMREX_ATTRIBUTE_FLATTEN_FOR
void ParallelForOMP (Box const& box, L const& f) noexcept
{
auto lo = amrex::lbound(box);
auto hi = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for
for (int i = lo.x; i <= hi.x; ++i) {
f(i,0,0);
}
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for
for (int j = lo.y; j <= hi.y; ++j) {
AMREX_PRAGMA_SIMD
for (int i = lo.x; i <= hi.x; ++i) {
f(i,j,0);
}
}
#else
#pragma omp parallel for collapse(2)
for (int k = lo.z; k <= hi.z; ++k) {
for (int j = lo.y; j <= hi.y; ++j) {
AMREX_PRAGMA_SIMD
for (int i = lo.x; i <= hi.x; ++i) {
f(i,j,k);
}
}
}
#endif
}

// CPU+OMP: loop over the cells of box and ncomp components, calling
// f(i,j,k,n).  The component loop n is outermost and is included in the
// OMP collapse so that threading scales even for small boxes with many
// components; i stays innermost with a SIMD hint.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
AMREX_ATTRIBUTE_FLATTEN_FOR
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
auto lo = amrex::lbound(box);
auto hi = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for collapse(2)
for (T n = 0; n < ncomp; ++n) {
AMREX_PRAGMA_SIMD
for (int i = lo.x; i <= hi.x; ++i) {
f(i,0,0,n);
}
}
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for collapse(2)
for (T n = 0; n < ncomp; ++n) {
for (int j = lo.y; j <= hi.y; ++j) {
AMREX_PRAGMA_SIMD
for (int i = lo.x; i <= hi.x; ++i) {
f(i,j,0,n);
}
}
}
#else
#pragma omp parallel for collapse(3)
for (T n = 0; n < ncomp; ++n) {
for (int k = lo.z; k <= hi.z; ++k) {
for (int j = lo.y; j <= hi.y; ++j) {
AMREX_PRAGMA_SIMD
for (int i = lo.x; i <= hi.x; ++i) {
f(i,j,k,n);
}
}
}
}
#endif
}

#endif

}
352+
244353
#endif

0 commit comments

Comments (0)