Skip to content

Commit 35731cb

Browse files
authored
Implement parallel cuda::std::adjacent_difference (#7880)
This implements the `adjacent_difference` algorithm for the CUDA backend. * std::adjacent_difference — see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference.html It provides tests and benchmarks similar to Thrust and some boilerplate for libcu++. The functionality is not publicly available yet and is implemented in a private internal header. Fixes #7753
1 parent 612ccf2 commit 35731cb

File tree

8 files changed

+543
-5
lines changed

8 files changed

+543
-5
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/adjacent_difference.h>
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream>

#include "nvbench_helper.cuh"

// Benchmark cuda::std::adjacent_difference with the default difference
// operation. Reads and writes `Elements` items of type T in global memory.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input = generate(num_elements);
  thrust::device_vector<T> output(num_elements);

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc;
  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      do_not_optimize(
        cuda::std::adjacent_difference(cuda_policy(alloc, launch), input.cbegin(), input.cend(), output.begin()));
    });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));

// Same benchmark, but with an explicit user-provided binary operation
// (cuda::std::greater<T>) instead of the default difference.
template <typename T>
static void with_comp(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input = generate(num_elements);
  thrust::device_vector<T> output(num_elements);

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc;
  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      do_not_optimize(cuda::std::adjacent_difference(
        cuda_policy(alloc, launch), input.cbegin(), input.cend(), output.begin(), ::cuda::std::greater<T>{}));
    });
}

NVBENCH_BENCH_TYPES(with_comp, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("with_comp")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));

libcudacxx/benchmarks/bench/inclusive_scan/max.cu

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,6 @@
1616

1717
#include "nvbench_helper.cuh"
1818

19-
NVBENCH_BENCH_TYPES(range_iter, NVBENCH_TYPE_AXES(fundamental_types))
20-
.set_name("range_iter")
21-
.set_type_axes_names({"T{ct}"})
22-
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
23-
2419
template <typename T>
2520
static void range_iter_op(nvbench::state& state, nvbench::type_list<T>)
2621
{
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_ADJACENT_DIFFERENCE_H
#define _CUDA_STD___PSTL_ADJACENT_DIFFERENCE_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

// The parallel front-end requires host-side dispatch machinery, which is not
// available under NVRTC.
#if !_CCCL_COMPILER(NVRTC)

#  include <cuda/__nvtx/nvtx.h>
#  include <cuda/std/__concepts/concept_macros.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__functional/operations.h>
#  include <cuda/std/__iterator/concepts.h>
#  include <cuda/std/__iterator/distance.h>
#  include <cuda/std/__numeric/adjacent_difference.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__type_traits/is_execution_policy.h>
#  include <cuda/std/__utility/move.h>

#  if _CCCL_HAS_BACKEND_CUDA()
#    include <cuda/std/__pstl/cuda/adjacent_difference.h>
#  endif // _CCCL_HAS_BACKEND_CUDA()

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

//! Parallel overload of cuda::std::adjacent_difference.
//!
//! Selects a backend via __pstl_select_dispatch and forwards the call to it;
//! if no backend is selected for this configuration, compilation fails with a
//! static_assert (the serial call after it is never reached and only keeps
//! return-type deduction well-formed).
//!
//! \param __policy    Execution policy (must satisfy is_execution_policy_v).
//! \param __first     Beginning of the input range.
//! \param __last      End of the input range.
//! \param __result    Beginning of the output range.
//! \param __binary_op Binary difference operation; defaults to minus<>.
//! \return Iterator past the last element written.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _BinaryOp = ::cuda::std::minus<>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator adjacent_difference(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  _BinaryOp __binary_op = {})
{
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__adjacent_difference,
                                                   _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::adjacent_difference");

    // Empty input: nothing to compute, return the untouched output iterator.
    if (__first == __last)
    {
      return __result;
    }

    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      ::cuda::std::move(__binary_op));
  }
  else
  {
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::adjacent_difference requires at least one selected backend");
    return ::cuda::std::adjacent_difference(
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      ::cuda::std::move(__binary_op));
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

#  include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_ADJACENT_DIFFERENCE_H
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_CUDA_ADJACENT_DIFFERENCE_H
#define _CUDA_STD___PSTL_CUDA_ADJACENT_DIFFERENCE_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if _CCCL_HAS_BACKEND_CUDA()

_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef")
_CCCL_DIAG_SUPPRESS_GCC("-Wattributes")
_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage)

#  include <cub/device/device_adjacent_difference.cuh>

_CCCL_DIAG_POP

#  include <cuda/__execution/policy.h>
#  include <cuda/__functional/call_or.h>
#  include <cuda/__memory_pool/device_memory_pool.h>
#  include <cuda/__memory_resource/get_memory_resource.h>
#  include <cuda/__stream/get_stream.h>
#  include <cuda/__stream/stream_ref.h>
#  include <cuda/std/__exception/cuda_error.h>
#  include <cuda/std/__exception/exception_macros.h>
#  include <cuda/std/__execution/env.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__numeric/adjacent_difference.h>
#  include <cuda/std/__pstl/cuda/temporary_storage.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__utility/move.h>

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

//! CUDA backend for the parallel cuda::std::adjacent_difference, implemented
//! on top of cub::DeviceAdjacentDifference::SubtractLeftCopy.
template <>
struct __pstl_dispatch<__pstl_algorithm::__adjacent_difference, __execution_backend::__cuda>
{
  //! Runs the CUB implementation on the stream/memory resource extracted from
  //! \p __policy. The input and output ranges must not overlap (SubtractLeftCopy
  //! requirement, matching the standard's precondition for the parallel overload).
  template <class _Policy, class _InputIterator, class _OutputIterator, class _BinaryOp>
  [[nodiscard]] _CCCL_HOST_API static _OutputIterator __par_impl(
    const _Policy& __policy,
    _InputIterator __first,
    _InputIterator __last,
    _OutputIterator __result,
    _BinaryOp __binary_op)
  {
    auto __count = ::cuda::std::distance(__first, __last);
    auto __ret   = __result + __count;

    // Query the temporary device storage requirements of SubtractLeftCopy
    // (nullptr temp-storage pointer triggers the size-only pass).
    size_t __num_bytes = 0;
    _CCCL_TRY_CUDA_API(
      ::cub::DeviceAdjacentDifference::SubtractLeftCopy,
      "__pstl_cuda_adjacent_difference: determination of device storage for "
      "cub::DeviceAdjacentDifference::SubtractLeftCopy failed",
      static_cast<void*>(nullptr),
      __num_bytes,
      __first,
      __result,
      __count,
      __binary_op,
      0);

    // Pull stream and memory resource from the policy environment, falling back
    // to the per-thread default stream and the device default memory pool.
    auto __stream   = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
    auto __resource = ::cuda::__call_or(
      ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);
    {
      __temporary_storage<void, decltype(__resource)> __storage{__stream, __resource, __num_bytes};

      // Run the kernel; the standard requires that the input and output ranges
      // do not overlap.
      _CCCL_TRY_CUDA_API(
        ::cub::DeviceAdjacentDifference::SubtractLeftCopy,
        "__pstl_cuda_adjacent_difference: kernel launch of cub::DeviceAdjacentDifference::SubtractLeftCopy failed",
        __storage.__get_temp_storage(),
        __num_bytes,
        ::cuda::std::move(__first),
        ::cuda::std::move(__result),
        __count,
        ::cuda::std::move(__binary_op),
        __stream.get());
    }

    return __ret;
  }

  //! Entry point invoked by the dispatch front-end.
  //!
  //! \return Iterator past the last element written.
  //! \throws std::bad_alloc if temporary-storage allocation fails;
  //!         rethrows any other cuda::cuda_error unchanged.
  _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _BinaryOp)
  _CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator>)
  [[nodiscard]] _CCCL_HOST_API _OutputIterator operator()(
    [[maybe_unused]] const _Policy& __policy,
    _InputIterator __first,
    _InputIterator __last,
    _OutputIterator __result,
    _BinaryOp __binary_op) const
  {
    if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>
                  && ::cuda::std::__has_random_access_traversal<_OutputIterator>)
    {
      try
      {
        return __par_impl(
          __policy,
          ::cuda::std::move(__first),
          ::cuda::std::move(__last),
          ::cuda::std::move(__result),
          ::cuda::std::move(__binary_op));
      }
      catch (const ::cuda::cuda_error& __err)
      {
        // Surface allocation failures as bad_alloc, per the usual PSTL contract.
        if (__err.status() == cudaErrorMemoryAllocation)
        {
          _CCCL_THROW(::std::bad_alloc);
        }
        else
        {
          throw __err;
        }
      }
    }
    else
    {
      // Was: "cuda::std::merge" — copy-paste residue from the merge backend.
      static_assert(__always_false_v<_Policy>,
                    "CUDA backend of cuda::std::adjacent_difference requires random access iterators");
      return ::cuda::std::adjacent_difference(
        ::cuda::std::move(__first),
        ::cuda::std::move(__last),
        ::cuda::std::move(__result),
        ::cuda::std::move(__binary_op));
    }
  }
};

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION

#  include <cuda/std/__cccl/epilogue.h>

#endif // _CCCL_HAS_BACKEND_CUDA()

#endif // _CUDA_STD___PSTL_CUDA_ADJACENT_DIFFERENCE_H

libcudacxx/include/cuda/std/__pstl/dispatch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
3232

3333
enum class __pstl_algorithm
3434
{
35+
__adjacent_difference,
3536
__copy_if,
3637
__copy_n,
3738
__exclusive_scan,

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# pragma system_header
2222
#endif // no system header
2323

24+
#include <cuda/std/__pstl/adjacent_difference.h>
2425
#include <cuda/std/__pstl/all_of.h>
2526
#include <cuda/std/__pstl/any_of.h>
2627
#include <cuda/std/__pstl/copy.h>

0 commit comments

Comments
 (0)