Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions docs/libcudacxx/standard_api/algorithms_library.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ Algorithms Library
* - `\<cuda/std/algorithm\> <https://en.cppreference.com/w/cpp/header/algorithm>`_
- Fundamental library algorithms
- CCCL 3.2.0 / CUDA 13.2
* - `\<cuda/std/execution\> <https://en.cppreference.com/w/cpp/header/execution>`_
- Standard parallel algorithms
- CCCL 3.4.0 / CUDA 13.4

Extensions
----------
Expand All @@ -31,3 +34,57 @@ Restrictions
* ``sort``
* ``stable_partition``
* ``stable_sort``

Parallel standard algorithms
----------------------------

CCCL provides an implementation of the standard `parallel algorithms library <http://www.eel.is/c++draft/algorithms.parallel>`_.

Currently the CUDA backend is the only supported backend. It can be selected by passing the ``cuda::execution::gpu``
execution policy to one of the supported algorithms. The CUDA backend requires the passed-in sequences to reside in
device-accessible memory and the iterators into those sequences to be at least random access iterators. The CUDA backend
is enabled if the program is compiled with a CUDA compiler in CUDA mode.

The use of any other execution policy is currently not supported and results in a compile-time error.

The following algorithms are supported:

* ``adjacent_difference``
* ``adjacent_find``
* ``all_of``
* ``any_of``
* ``copy``
* ``copy_if``
* ``copy_n``
* ``count``
* ``count_if``
* ``equal``
* ``exclusive_scan``
* ``fill``
* ``fill_n``
* ``find``
* ``find_if``
* ``find_if_not``
* ``for_each``
* ``for_each_n``
* ``generate``
* ``generate_n``
* ``inclusive_scan``
* ``merge``
* ``mismatch``
* ``none_of``
* ``reduce``
* ``remove``
* ``remove_copy``
* ``remove_copy_if``
* ``remove_if``
* ``replace``
* ``replace_copy``
* ``replace_copy_if``
* ``replace_if``
* ``reverse``
* ``reverse_copy``
* ``transform``
* ``transform_reduce``
* ``unique``
* ``unique_copy``
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/adjacent_difference/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
3 changes: 1 addition & 2 deletions libcudacxx/benchmarks/bench/adjacent_find/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
#include <thrust/sequence.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
6 changes: 3 additions & 3 deletions libcudacxx/benchmarks/bench/all_of/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

#include <cuda/functional>
#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand All @@ -27,8 +27,8 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
const auto mismatch_point = static_cast<std::size_t>(elements * common_prefix);

thrust::device_vector<T> dinput(elements, thrust::no_init);
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin() + mismatch_point, dinput.end(), val);
cuda::std::fill(cuda::execution::gpu, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::gpu, dinput.begin() + mismatch_point, dinput.end(), val);

state.add_global_memory_reads<T>(mismatch_point + 1);
state.add_global_memory_writes<size_t>(1);
Expand Down
6 changes: 3 additions & 3 deletions libcudacxx/benchmarks/bench/any_of/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

#include <cuda/functional>
#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand All @@ -27,8 +27,8 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
const auto mismatch_point = static_cast<std::size_t>(elements * common_prefix);

thrust::device_vector<T> dinput(elements, thrust::no_init);
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin() + mismatch_point, dinput.end(), val);
cuda::std::fill(cuda::execution::gpu, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::gpu, dinput.begin() + mismatch_point, dinput.end(), val);

state.add_global_memory_reads<T>(mismatch_point + 1);
state.add_global_memory_writes<size_t>(1);
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/copy/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/copy_if/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/complex>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/copy_n/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/count/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/count_if/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
10 changes: 5 additions & 5 deletions libcudacxx/benchmarks/bench/equal/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

#include <cuda/iterator>
#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand All @@ -27,8 +27,8 @@ static void range_iter(nvbench::state& state, nvbench::type_list<T>)
const auto mismatch_point = static_cast<std::size_t>(elements * common_prefix);

thrust::device_vector<T> dinput(elements, thrust::no_init);
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin() + mismatch_point, dinput.end(), val);
cuda::std::fill(cuda::execution::gpu, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::gpu, dinput.begin() + mismatch_point, dinput.end(), val);

state.add_global_memory_reads<T>(mismatch_point + 1);
state.add_global_memory_writes<size_t>(1);
Expand Down Expand Up @@ -57,8 +57,8 @@ static void range_range(nvbench::state& state, nvbench::type_list<T>)
const auto mismatch_point = static_cast<std::size_t>(elements * common_prefix);

thrust::device_vector<T> dinput(elements, thrust::no_init);
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin() + mismatch_point, dinput.end(), val);
cuda::std::fill(cuda::execution::gpu, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::gpu, dinput.begin() + mismatch_point, dinput.end(), val);

state.add_global_memory_reads<T>(mismatch_point + 1);
state.add_global_memory_writes<size_t>(1);
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/exclusive_scan/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/exclusive_scan/max.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/fill/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/fill_n/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
6 changes: 3 additions & 3 deletions libcudacxx/benchmarks/bench/find/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand All @@ -26,8 +26,8 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
const auto mismatch_point = static_cast<std::size_t>(elements * common_prefix);

thrust::device_vector<T> dinput(elements, thrust::no_init);
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin() + mismatch_point, dinput.end(), val);
cuda::std::fill(cuda::execution::gpu, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::gpu, dinput.begin() + mismatch_point, dinput.end(), val);

state.add_global_memory_reads<T>(mismatch_point + 1);
state.add_global_memory_writes<size_t>(1);
Expand Down
6 changes: 3 additions & 3 deletions libcudacxx/benchmarks/bench/find_if/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

#include <cuda/functional>
#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand All @@ -27,8 +27,8 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
const auto mismatch_point = static_cast<std::size_t>(elements * common_prefix);

thrust::device_vector<T> dinput(elements, thrust::no_init);
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin() + mismatch_point, dinput.end(), val);
cuda::std::fill(cuda::execution::gpu, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::gpu, dinput.begin() + mismatch_point, dinput.end(), val);

state.add_global_memory_reads<T>(mismatch_point + 1);
state.add_global_memory_writes<size_t>(1);
Expand Down
6 changes: 3 additions & 3 deletions libcudacxx/benchmarks/bench/find_if_not/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down Expand Up @@ -41,8 +41,8 @@ static void basic(nvbench::state& state, nvbench::type_list<T>)
const auto mismatch_point = static_cast<std::size_t>(elements * common_prefix);

thrust::device_vector<T> dinput(elements, thrust::no_init);
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::__cub_par_unseq, dinput.begin() + mismatch_point, dinput.end(), val);
cuda::std::fill(cuda::execution::gpu, dinput.begin(), dinput.begin() + mismatch_point, T{0});
cuda::std::fill(cuda::execution::gpu, dinput.begin() + mismatch_point, dinput.end(), val);

state.add_global_memory_reads<T>(mismatch_point + 1);
state.add_global_memory_writes<size_t>(1);
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/for_each/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/for_each_n/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/generate/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/generate_n/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/inclusive_scan/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/inclusive_scan/max.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/is_sorted/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <thrust/sort.h>

#include <cuda/functional>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/is_sorted_until/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <thrust/sort.h>

#include <cuda/functional>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
2 changes: 1 addition & 1 deletion libcudacxx/benchmarks/bench/merge/basic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#include <thrust/sort.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/execution>
#include <cuda/stream>

#include "nvbench_helper.cuh"
Expand Down
Loading
Loading