Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 32 additions & 8 deletions components/omega/doc/devGuide/ParallelLoops.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ The following inner iteration patterns are supported in Omega:
- `parallelForInner`
- `parallelReduceInner`
- `parallelScanInner`
- `parallelSearchInner`

To provide even more flexibility, the outer loops support iterating over a multi-dimensional range.
Currently, the inner loops are limited to one dimension.
Expand Down Expand Up @@ -191,7 +192,7 @@ a 3D array in parallel using hierarchical parallelism.
Array3DReal A("A", N1, N2, N3);
parallelForOuter(
{N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
parallelForInner(Team, N3, INNER_LAMBDA(Int J3) {
parallelForInner(Team, N3, INNER_LAMBDA(int J3) {
A(J1, J2, J3) = J1 + J2 + J3;
});
});
Expand All @@ -203,7 +204,7 @@ diagonal of a square matrix one can do:
Array2DReal M("M", N, N);
parallelForOuter(
{N}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
parallelForInner(Team, J1, INNER_LAMBDA(Int J2) {
parallelForInner(Team, J1, INNER_LAMBDA(int J2) {
M(J1, J2) = J1 + J2;
});
});
Expand All @@ -219,7 +220,7 @@ in a 2D array might be done as follows.
parallelForOuter(
{N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
Real SumD3;
parallelReduceInner(Team, N3, INNER_LAMBDA(Int J3, Real &Accum) {
parallelReduceInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum) {
Accum += A(J1, J2, J3);
}, SumD3);
B(J1, J2) = SumD3;
Expand All @@ -233,10 +234,10 @@ For example, to additionally compute and store maxima along the third dimension
parallelForOuter(
{N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
Real SumD3, MaxD3;
parallelReduceInner(Team, N3, INNER_LAMBDA(Int J3, Real &AccumSum, Real &AccumMax) {
parallelReduceInner(Team, N3, INNER_LAMBDA(int J3, Real &AccumSum, Real &AccumMax) {
AccumSum += A(J1, J2, J3);
AccumMax = Kokkos::Max(AccumMax, A(J1, J2, J3));
}, SumN3, MaxN3);
AccumMax = Kokkos::max(AccumMax, A(J1, J2, J3));
}, SumD3, Kokkos::Max<Real>(MaxD3));
B(J1, J2) = SumD3;
C(J1, J2) = MaxD3;
});
Expand All @@ -253,7 +254,7 @@ be done as follows.
Array3DReal D("D", N1, N2, N3);
parallelForOuter(
{N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
parallelScanInner(Team, N1, INNER_LAMBDA(Int J3, Real &Accum, bool IsFinal) {
parallelScanInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum, bool IsFinal) {
Accum += A(J1, J2, J3);
if (IsFinal) {
D(J1, J2, J3) = Accum;
Expand All @@ -266,7 +267,7 @@ before the `if` statement. That is, it performs an inclusive scan. To compute an
simply move the addition after the `if` statement.
```c++
Real FinalScanValue;
parallelScanInner(Team, N1, INNER_LAMBDA(Int J3, Real &Accum, bool IsFinal) {
parallelScanInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum, bool IsFinal) {
if (IsFinal) {
D(J1, J2, J3) = Accum;
}
Expand All @@ -277,3 +278,26 @@ Moreover, this example illustrates that the final scan value can be obtained by
an additional argument `FinalScanValue`. Labels are not supported by `parallelScanInner`
and only one-dimensional index range can be used. In contrast to `parallelReduceInner`,
`parallelScanInner` supports only sum-based scans and only one scan variable.

### parallelSearchInner
To search an index range in parallel for the first index at which a given condition occurs,
Omega provides the `parallelSearchInner` function.
For example, the following code finds, for each row of a matrix, the first column index where
the matrix element is above a certain threshold. If no element matches the condition then
`parallelSearchInner` returns `-1`.
```c++
Array2DReal M("M", N1, N2);
Array1DI4 ThresholdIdx("ThresholdIdx", N1);
const Real Threshold = 0.5;
parallelForOuter(
{N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {

int Idx;
parallelSearchInner(Team, N2, INNER_LAMBDA(int J2) {
return M(J1, J2) > Threshold;
}, Idx);

ThresholdIdx(J1) = Idx;
});
```
Labels are not supported by `parallelSearchInner` and only one-dimensional index range can be used.
38 changes: 38 additions & 0 deletions components/omega/src/infra/OmegaKokkos.h
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,44 @@ KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound,
std::forward<R>(Reducers)...);
}

// parallelSearchInner
// Given a functor taking an index and returning a bool this function
// returns the first index in the range [0, UpperBound) for which the input
// functor returns true. If no such index is found it returns -1
template <class F>
KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound,
F &&Functor, int &Idx) {
static_assert(std::is_same_v<std::invoke_result_t<F, int>, bool>,
"parallelSearchInner requires a functor that takes an int and "
"returns bool");

// There are different implementations for host and device since the
// parallel_reduce version doesn't return early leading to performance loss
// on CPUs
#ifndef OMEGA_TARGET_DEVICE
Idx = -1;
for (int I = 0; I < UpperBound; ++I) {
if (Functor(I)) {
Idx = I;
break;
}
}
#else
const auto Policy = TeamThreadRange(Team, UpperBound);
Kokkos::parallel_reduce(
Policy,
INNER_LAMBDA(int I, int &Accum) {
if (I <= Accum && Functor(I)) {
Accum = I;
}
},
Kokkos::Min<int>(Idx));
if (Idx == Kokkos::reduction_identity<int>::min()) {
Idx = -1;
}
#endif
}

} // end namespace OMEGA

//===----------------------------------------------------------------------===//
Expand Down
105 changes: 105 additions & 0 deletions components/omega/test/infra/OmegaKokkosHiParTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,110 @@ Error testHiparReduce1DReduce1D(int N1) {
return Err;
}

Error testHiparFor1DSearch1D(int N2) {
Error Err;

const int Threshold = N2 / 2;
const int N1 = 3 * N2 + 3;

HostArray2DI4 DataH("DataH", N1, N2);

for (int J1 = 0; J1 < 3 * N2; ++J1) {
if (J1 < N2 + 10) {
for (int J2 = 0; J2 < N2; ++J2) {
DataH(J1, J2) = Threshold - (J1 - J2);
}
} else {
for (int J2 = 0; J2 < N2; ++J2) {
DataH(J1, J2) = Threshold - (J1 / 4 - J2);
}
}
}

// Ensure these patterns are in the input data
for (int J2 = 0; J2 < N2; ++J2) {
// Everything above threshold
DataH(3 * N2, J2) = Threshold + 1;
// Everything below threshold
DataH(3 * N2 + 1, J2) = Threshold - 1;
// Multiple non-consecutive values above threshold
DataH(3 * N2 + 2, J2) = Threshold - 3 + J2 % 4;
}

auto DataD = createDeviceMirrorCopy(DataH);

HostArray1DI4 RefIdxH("RefIdxH", N1);
Array1DI4 IdxD("IdxD", N1);

// test searching full range

for (int J1 = 0; J1 < N1; ++J1) {
int Idx = -1;
for (int J2 = 0; J2 < N2; ++J2) {
if (DataH(J1, J2) >= Threshold) {
Idx = J2;
break;
}
}
RefIdxH(J1) = Idx;
}

parallelForOuter(
{N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
parallelSearchInner(
Team, N2,
INNER_LAMBDA(int J2) { return DataD(J1, J2) >= Threshold; },
IdxD(J1));
});

if (!arraysEqual(IdxD, RefIdxH)) {
Err += Error(ErrorCode::Fail,
errorMsg("parallelFor1DSearch1D Full FAIL", N1));
}

deepCopy(RefIdxH, 0);
deepCopy(IdxD, 0);

// test searching limited range

if (N2 / 4 > 0) {

for (int J1 = 0; J1 < N1; ++J1) {
int Idx = -1;
const int Start = N2 / 4 - J1 % (N2 / 4);
const int End = 3 * N2 / 4 + J1 % (N2 / 4);
for (int J2 = Start; J2 < End; ++J2) {
if (DataH(J1, J2) >= Threshold) {
Idx = J2;
break;
}
}
RefIdxH(J1) = Idx;
}

parallelForOuter(
{N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
const int Start = N2 / 4 - J1 % (N2 / 4);
const int End = 3 * N2 / 4 + J1 % (N2 / 4);
int SearchIdx;
parallelSearchInner(
Team, End - Start,
INNER_LAMBDA(int J2) {
return DataD(J1, J2 + Start) >= Threshold;
},
SearchIdx);
IdxD(J1) = SearchIdx == -1 ? SearchIdx : SearchIdx + Start;
});

if (!arraysEqual(IdxD, RefIdxH)) {
Err += Error(ErrorCode::Fail,
errorMsg("parallelFor1DSearch1D Limited FAIL", N1));
}
}

return Err;
}

Error testHiparFor1DMultiple1D(int N1, int N2) {
Error Err;

Expand Down Expand Up @@ -687,6 +791,7 @@ int main(int argc, char **argv) {
#if !defined(KOKKOS_ENABLE_SYCL) || KOKKOS_VERSION_GREATER_EQUAL(4, 7, 1)
Err += testHiparReduce1DReduce1D(N1);
#endif
Err += testHiparFor1DSearch1D(N1);

Err += testHiparFor1DMultiple1D(1, N1);
Err += testHiparFor1DMultiple1D((N1 + 1) / 2, N1);
Expand Down
Loading