Skip to content
872 changes: 549 additions & 323 deletions sycl/include/sycl/reduction.hpp

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions sycl/test-e2e/Reduction/reduction_big_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@ int test(queue &Q, T Identity) {

// Initialize.
BinaryOperation BOp;
T CorrectOut;
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
std::optional<T> CorrectOutOpt;
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
T CorrectOut = *CorrectOutOpt;

// Compute.
Q.submit([&](handler &CGH) {
Expand Down
4 changes: 2 additions & 2 deletions sycl/test-e2e/Reduction/reduction_ctor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void test_reducer(Reduction &Redu, T A, T B) {

typename Reduction::binary_operation BOp;
T ExpectedValue = BOp(A, B);
assert(ExpectedValue == detail::ReducerAccess{Reducer}.getElement(0) &&
assert(ExpectedValue == *detail::ReducerAccess{Reducer}.getElement(0) &&
"Wrong result of binary operation.");
assert(
toBool(Reducer.identity() == Redu.getIdentityContainer().getIdentity()) &&
Expand All @@ -40,7 +40,7 @@ void test_reducer(Reduction &Redu, T Identity, BinaryOperation BOp, T A, T B) {

T ExpectedValue = BOp(A, B);
assert(
toBool(ExpectedValue == detail::ReducerAccess{Reducer}.getElement(0)) &&
toBool(ExpectedValue == *detail::ReducerAccess{Reducer}.getElement(0)) &&
"Wrong result of binary operation.");
assert(
toBool(Reducer.identity() == Redu.getIdentityContainer().getIdentity()) &&
Expand Down
4 changes: 3 additions & 1 deletion sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ struct Red {
}

void init() {
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
std::optional<T> CorrectOutOpt;
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
CorrectOut = *CorrectOutOpt;
if (!PropList.template has_property<
property::reduction::initialize_to_identity>())
CorrectOut = BOp(CorrectOut, InitVal);
Expand Down
66 changes: 66 additions & 0 deletions sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
//
// Group algorithms are not supported on Nvidia.
// XFAIL: hip_nvidia

// This test performs basic checks of parallel_for(nd_range, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, class BinaryOperation>
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize,
size_t NWItems) {
nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
NumErrors += test<SkipEvenName<Name>, T>(Q, Identity, Init, BOp, NDRange,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(Q, Identity, Init, BOp, NDRange,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(Q, Identity, Init, BOp, NDRange,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);

// Check some non power-of-two work-group sizes.
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1, 7);
tests<class A2, int>(Q, 0, 99, std::plus<int>{}, 49, 49 * 5);

// Try some power-of-two work-group sizes.
tests<class B1, int>(Q, 0, 99, std::plus<>{}, 1, 32);
tests<class B2, int>(Q, 1, 99, std::multiplies<>{}, 4, 32);
tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, 8, 128);
tests<class B4, int>(Q, 0, 99, std::bit_xor<>{}, 16, 256);
tests<class B5, int>(Q, ~0, 99, std::bit_and<>{}, 32, 256);
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
ext::oneapi::minimum<>{}, 64, 256);
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
ext::oneapi::maximum<>{}, 128, 256);
tests<class B8, int>(Q, 0, 99, std::plus<>{}, 256, 256);

// Check with various types.
tests<class C1, float>(Q, 1, 99, std::multiplies<>{}, 8, 24);
tests<class C2, short>(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256);
tests<class C3, unsigned char>(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256);

// Check with CUSTOM type.
using CV = CustomVec<long long>;
tests<class D1, CV>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, 8, 256);

printFinalStatus(NumErrors);
return NumErrors;
}
65 changes: 65 additions & 0 deletions sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<1>, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, typename... ArgTys>
void tests(ArgTys &&...Args) {
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

constexpr access::mode RW = access::mode::read_write;
// Fast-reduce and Fast-atomics. Try various range types/sizes.
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, range<1>(1));
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<1>(2));
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<1>(7));
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<1>(64));
tests<class A5, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2));
tests<class A6, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2 + 5));

// Check with CUSTOM type.
tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<1>(256));
tests<class B2, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<1>(MaxWGSize * 3));
tests<class B3, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
range<1>(72));

// Check with identityless operations.
tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(1));
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(2));
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(7));
tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(64));
tests<class C5, int>(Q, 99, PlusWithoutIdentity<int>{},
range<1>(MaxWGSize * 2));
tests<class C6, int>(Q, 99, PlusWithoutIdentity<int>{},
range<1>(MaxWGSize * 2 + 5));

printFinalStatus(NumErrors);
return NumErrors;
}
69 changes: 69 additions & 0 deletions sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not suport atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<2>, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, typename... ArgTys>
void tests(ArgTys &&...Args) {
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});

tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<2>{33, MaxWGSize});
tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
range<2>{33, MaxWGSize});

tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{1, 1});
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 2});
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 3});
tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{},
range<2>{MaxWGSize, 1});
tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{1, MaxWGSize});
tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{2, MaxWGSize * 2});
tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{MaxWGSize * 3, 7});
tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{3, MaxWGSize * 3});

printFinalStatus(NumErrors);
return NumErrors;
}
23 changes: 21 additions & 2 deletions sycl/test-e2e/Reduction/reduction_range_3d_rw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,12 @@ int main() {
tests<class D2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
tests<class D3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});

/* Temporarily disabled
tests<class D4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class D5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class D6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize + 1, 1, 1});
*/

tests<class D7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{2, 5, MaxWGSize * 2});
Expand All @@ -83,6 +81,27 @@ int main() {
tests<class D9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize * 3, 8, 4});

tests<class E1, int>(Q, 99, MultipliesWithoutIdentity<int>{},
range<3>{1, 1, 1});
tests<class E2, int>(Q, 99, MultipliesWithoutIdentity<int>{},
range<3>{2, 2, 2});
tests<class E3, int>(Q, 99, MultipliesWithoutIdentity<int>{},
range<3>{2, 3, 4});

tests<class E4, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class E5, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class E6, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{MaxWGSize + 1, 1, 1});

tests<class E7, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{2, 5, MaxWGSize * 2});
tests<class E8, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{3, MaxWGSize * 3, 2});
tests<class E9, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{MaxWGSize * 3, 8, 4});

printFinalStatus(NumErrors);
return NumErrors;
}
82 changes: 82 additions & 0 deletions sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not suport atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<3>, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, typename... ArgTys>
void tests(ArgTys &&...Args) {
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1});
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2});
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4});

tests<class A4, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{MaxWGSize + 1, 1, 1});

tests<class A7, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{2, 5, MaxWGSize * 2});
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{3, MaxWGSize * 3, 2});
tests<class A9, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{MaxWGSize * 3, 8, 4});

tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<3>{2, 33, MaxWGSize});
tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
range<3>{2, 33, MaxWGSize});

tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{1, 1, 1});
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});

tests<class C4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize + 1, 1, 1});

tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{2, 5, MaxWGSize * 2});
tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{3, MaxWGSize * 3, 2});
tests<class C9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize * 3, 8, 4});

printFinalStatus(NumErrors);
return NumErrors;
}
4 changes: 3 additions & 1 deletion sycl/test-e2e/Reduction/reduction_range_N_vars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ struct Red {
}

void init() {
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
std::optional<T> CorrectOutOpt;
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
CorrectOut = *CorrectOutOpt;
if (!PropList.template has_property<
property::reduction::initialize_to_identity>())
CorrectOut = BOp(CorrectOut, InitVal);
Expand Down
6 changes: 3 additions & 3 deletions sycl/test-e2e/Reduction/reduction_usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ int test(queue &Q, OptionalIdentity<T, HasIdentity> Identity, T Init,
}

// Initialize.
T CorrectOut;
std::optional<T> CorrectOutOpt;
BinaryOperation BOp;

buffer<T, 1> InBuf(NWItems);
initInputData(InBuf, CorrectOut, BOp, NWItems);
CorrectOut = BOp(CorrectOut, Init);
initInputData(InBuf, CorrectOutOpt, BOp, NWItems);
T CorrectOut = BOp(*CorrectOutOpt, Init);

// Compute.
Q.submit([&](handler &CGH) {
Expand Down
Loading