Skip to content

Commit 35731cb

Browse files
authored
Implement parallel cuda::std::adjacent_difference (#7880)
This implements the `adjacent_difference` algorithm for the CUDA backend. * std::adjacent_difference — see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference.html It provides tests and benchmarks similar to Thrust and some boilerplate for libcu++. The functionality is not publicly available yet and is implemented in a private internal header. Fixes #7753
1 parent 612ccf2 commit 35731cb

File tree

8 files changed

+543
-5
lines changed

8 files changed

+543
-5
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/adjacent_difference.h>
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream>

#include "nvbench_helper.cuh"

// Benchmark cuda::std::adjacent_difference with the default difference
// operation. Reads and writes `Elements` items of type T in global memory.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input = generate(num_elements);
  thrust::device_vector<T> output(num_elements);

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc;
  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      do_not_optimize(
        cuda::std::adjacent_difference(cuda_policy(alloc, launch), input.cbegin(), input.cend(), output.begin()));
    });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));

// Same benchmark, but with an explicit user-provided binary operation
// (cuda::std::greater<T>) instead of the default difference.
template <typename T>
static void with_comp(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input = generate(num_elements);
  thrust::device_vector<T> output(num_elements);

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc;
  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      do_not_optimize(cuda::std::adjacent_difference(
        cuda_policy(alloc, launch), input.cbegin(), input.cend(), output.begin(), ::cuda::std::greater<T>{}));
    });
}

NVBENCH_BENCH_TYPES(with_comp, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("with_comp")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));

libcudacxx/benchmarks/bench/inclusive_scan/max.cu

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,6 @@
1616

1717
#include "nvbench_helper.cuh"
1818

19-
NVBENCH_BENCH_TYPES(range_iter, NVBENCH_TYPE_AXES(fundamental_types))
20-
.set_name("range_iter")
21-
.set_type_axes_names({"T{ct}"})
22-
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
23-
2419
template <typename T>
2520
static void range_iter_op(nvbench::state& state, nvbench::type_list<T>)
2621
{
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_ADJACENT_DIFFERENCE_H
#define _CUDA_STD___PSTL_ADJACENT_DIFFERENCE_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

// The parallel front-end requires host-side dispatch machinery, which is not
// available under NVRTC.
#if !_CCCL_COMPILER(NVRTC)

#  include <cuda/__nvtx/nvtx.h>
#  include <cuda/std/__concepts/concept_macros.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__functional/operations.h>
#  include <cuda/std/__iterator/concepts.h>
#  include <cuda/std/__iterator/distance.h>
#  include <cuda/std/__numeric/adjacent_difference.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__type_traits/is_execution_policy.h>
#  include <cuda/std/__utility/move.h>

#  if _CCCL_HAS_BACKEND_CUDA()
#    include <cuda/std/__pstl/cuda/adjacent_difference.h>
#  endif // _CCCL_HAS_BACKEND_CUDA()

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

//! Parallel overload of cuda::std::adjacent_difference.
//!
//! Selects a backend via __pstl_select_dispatch and forwards the call to it;
//! if no backend is selected for this configuration, compilation fails with a
//! static_assert (the serial call after it is never reached and only keeps
//! return-type deduction well-formed).
//!
//! \param __policy    Execution policy (must satisfy is_execution_policy_v).
//! \param __first     Beginning of the input range.
//! \param __last      End of the input range.
//! \param __result    Beginning of the output range.
//! \param __binary_op Binary difference operation; defaults to minus<>.
//! \return Iterator past the last element written.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _BinaryOp = ::cuda::std::minus<>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
                 is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator adjacent_difference(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  _BinaryOp __binary_op = {})
{
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__adjacent_difference,
                                                   _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::adjacent_difference");

    // Empty input: nothing to compute, return the untouched output iterator.
    if (__first == __last)
    {
      return __result;
    }

    return __dispatch(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      ::cuda::std::move(__binary_op));
  }
  else
  {
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::adjacent_difference requires at least one selected backend");
    return ::cuda::std::adjacent_difference(
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      ::cuda::std::move(__binary_op));
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

#  include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_ADJACENT_DIFFERENCE_H
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_CUDA_ADJACENT_DIFFERENCE_H
#define _CUDA_STD___PSTL_CUDA_ADJACENT_DIFFERENCE_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if _CCCL_HAS_BACKEND_CUDA()

_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef")
_CCCL_DIAG_SUPPRESS_GCC("-Wattributes")
_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage)

#  include <cub/device/device_adjacent_difference.cuh>

_CCCL_DIAG_POP

#  include <cuda/__execution/policy.h>
#  include <cuda/__functional/call_or.h>
#  include <cuda/__memory_pool/device_memory_pool.h>
#  include <cuda/__memory_resource/get_memory_resource.h>
#  include <cuda/__stream/get_stream.h>
#  include <cuda/__stream/stream_ref.h>
#  include <cuda/std/__exception/cuda_error.h>
#  include <cuda/std/__exception/exception_macros.h>
#  include <cuda/std/__execution/env.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__numeric/adjacent_difference.h>
#  include <cuda/std/__pstl/cuda/temporary_storage.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__utility/move.h>

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

//! CUDA backend for the parallel cuda::std::adjacent_difference, implemented
//! on top of cub::DeviceAdjacentDifference::SubtractLeftCopy.
template <>
struct __pstl_dispatch<__pstl_algorithm::__adjacent_difference, __execution_backend::__cuda>
{
  //! Runs the CUB implementation on the stream/memory resource extracted from
  //! \p __policy. The input and output ranges must not overlap (SubtractLeftCopy
  //! requirement, matching the standard's precondition for the parallel overload).
  template <class _Policy, class _InputIterator, class _OutputIterator, class _BinaryOp>
  [[nodiscard]] _CCCL_HOST_API static _OutputIterator __par_impl(
    const _Policy& __policy,
    _InputIterator __first,
    _InputIterator __last,
    _OutputIterator __result,
    _BinaryOp __binary_op)
  {
    auto __count = ::cuda::std::distance(__first, __last);
    auto __ret   = __result + __count;

    // Query the temporary device storage requirements of SubtractLeftCopy
    // (nullptr temp-storage pointer triggers the size-only pass).
    size_t __num_bytes = 0;
    _CCCL_TRY_CUDA_API(
      ::cub::DeviceAdjacentDifference::SubtractLeftCopy,
      "__pstl_cuda_adjacent_difference: determination of device storage for "
      "cub::DeviceAdjacentDifference::SubtractLeftCopy failed",
      static_cast<void*>(nullptr),
      __num_bytes,
      __first,
      __result,
      __count,
      __binary_op,
      0);

    // Pull stream and memory resource from the policy environment, falling back
    // to the per-thread default stream and the device default memory pool.
    auto __stream   = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
    auto __resource = ::cuda::__call_or(
      ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);
    {
      __temporary_storage<void, decltype(__resource)> __storage{__stream, __resource, __num_bytes};

      // Run the kernel; the standard requires that the input and output ranges
      // do not overlap.
      _CCCL_TRY_CUDA_API(
        ::cub::DeviceAdjacentDifference::SubtractLeftCopy,
        "__pstl_cuda_adjacent_difference: kernel launch of cub::DeviceAdjacentDifference::SubtractLeftCopy failed",
        __storage.__get_temp_storage(),
        __num_bytes,
        ::cuda::std::move(__first),
        ::cuda::std::move(__result),
        __count,
        ::cuda::std::move(__binary_op),
        __stream.get());
    }

    return __ret;
  }

  //! Entry point invoked by the dispatch front-end.
  //!
  //! \return Iterator past the last element written.
  //! \throws std::bad_alloc if temporary-storage allocation fails;
  //!         rethrows any other cuda::cuda_error unchanged.
  _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _BinaryOp)
  _CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator>)
  [[nodiscard]] _CCCL_HOST_API _OutputIterator operator()(
    [[maybe_unused]] const _Policy& __policy,
    _InputIterator __first,
    _InputIterator __last,
    _OutputIterator __result,
    _BinaryOp __binary_op) const
  {
    if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>
                  && ::cuda::std::__has_random_access_traversal<_OutputIterator>)
    {
      try
      {
        return __par_impl(
          __policy,
          ::cuda::std::move(__first),
          ::cuda::std::move(__last),
          ::cuda::std::move(__result),
          ::cuda::std::move(__binary_op));
      }
      catch (const ::cuda::cuda_error& __err)
      {
        // Surface allocation failures as bad_alloc, per the usual PSTL contract.
        if (__err.status() == cudaErrorMemoryAllocation)
        {
          _CCCL_THROW(::std::bad_alloc);
        }
        else
        {
          throw __err;
        }
      }
    }
    else
    {
      // Was: "cuda::std::merge" — copy-paste residue from the merge backend.
      static_assert(__always_false_v<_Policy>,
                    "CUDA backend of cuda::std::adjacent_difference requires random access iterators");
      return ::cuda::std::adjacent_difference(
        ::cuda::std::move(__first),
        ::cuda::std::move(__last),
        ::cuda::std::move(__result),
        ::cuda::std::move(__binary_op));
    }
  }
};

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION

#  include <cuda/std/__cccl/epilogue.h>

#endif // _CCCL_HAS_BACKEND_CUDA()

#endif // _CUDA_STD___PSTL_CUDA_ADJACENT_DIFFERENCE_H

libcudacxx/include/cuda/std/__pstl/dispatch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
3232

3333
enum class __pstl_algorithm
3434
{
35+
__adjacent_difference,
3536
__copy_if,
3637
__copy_n,
3738
__exclusive_scan,

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
# pragma system_header
2222
#endif // no system header
2323

24+
#include <cuda/std/__pstl/adjacent_difference.h>
2425
#include <cuda/std/__pstl/all_of.h>
2526
#include <cuda/std/__pstl/any_of.h>
2627
#include <cuda/std/__pstl/copy.h>

0 commit comments

Comments
 (0)