Skip to content

Commit 56246c6

Browse files
committed
Update detail memcpy_async to return CUDA error
1 parent d735479 commit 56246c6

File tree

4 files changed

+34
-37
lines changed

4 files changed

+34
-37
lines changed

include/cuco/detail/hyperloglog/hyperloglog_impl.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
#include <cuco/detail/error.hpp>
2020
#include <cuco/detail/hyperloglog/finalizer.cuh>
2121
#include <cuco/detail/hyperloglog/kernels.cuh>
22-
#include <cuco/detail/utility/memcpy_async.cuh>
22+
#include <cuco/detail/utility/memcpy_async.hpp>
2323
#include <cuco/detail/utils.hpp>
2424
#include <cuco/hash_functions.cuh>
2525
#include <cuco/types.cuh>
@@ -421,11 +421,11 @@ class hyperloglog_impl {
421421
std::vector<register_type> host_sketch(num_regs);
422422

423423
// TODO check if storage is host accessible
424-
cuco::detail::memcpy_async(host_sketch.data(),
425-
this->sketch_.data(),
426-
sizeof(register_type) * num_regs,
427-
cudaMemcpyDefault,
428-
stream);
424+
CUCO_CUDA_TRY(cuco::detail::memcpy_async(host_sketch.data(),
425+
this->sketch_.data(),
426+
sizeof(register_type) * num_regs,
427+
cudaMemcpyDefault,
428+
stream));
429429
#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
430430
stream.sync();
431431
#else

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
#include <cuco/detail/open_addressing/kernels.cuh>
2222
#include <cuco/detail/storage/counter_storage.cuh>
2323
#include <cuco/detail/utility/cuda.hpp>
24-
#include <cuco/detail/utility/memcpy_async.cuh>
24+
#include <cuco/detail/utility/memcpy_async.hpp>
2525
#include <cuco/detail/utils.hpp>
2626
#include <cuco/extent.cuh>
2727
#include <cuco/operator.hpp>
@@ -883,8 +883,8 @@ class open_addressing_impl {
883883
stream.get()));
884884

885885
size_type temp_count;
886-
cuco::detail::memcpy_async(
887-
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream);
886+
CUCO_CUDA_TRY(cuco::detail::memcpy_async(
887+
&temp_count, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
888888
#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
889889
stream.sync();
890890
#else

include/cuco/detail/static_map.inl

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
#include <cuco/detail/bitwise_compare.cuh>
1818
#include <cuco/detail/error.hpp>
19-
#include <cuco/detail/utility/memcpy_async.cuh>
19+
#include <cuco/detail/utility/memcpy_async.hpp>
2020
#include <cuco/detail/utils.cuh>
2121
#include <cuco/detail/utils.hpp>
2222

@@ -109,11 +109,11 @@ void static_map<Key, Value, Scope, Allocator>::insert(
109109

110110
detail::insert<block_size, tile_size>
111111
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
112-
cuco::detail::memcpy_async(&h_num_successes,
113-
num_successes_,
114-
sizeof(atomic_ctr_type),
115-
cudaMemcpyDeviceToHost,
116-
cuda::stream_ref{stream});
112+
CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
113+
num_successes_,
114+
sizeof(atomic_ctr_type),
115+
cudaMemcpyDeviceToHost,
116+
cuda::stream_ref{stream}));
117117

118118
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated
119119

@@ -150,11 +150,11 @@ void static_map<Key, Value, Scope, Allocator>::insert_if(InputIt first,
150150

151151
detail::insert_if_n<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
152152
first, num_keys, num_successes_, view, stencil, pred, hash, key_equal);
153-
cuco::detail::memcpy_async(&h_num_successes,
154-
num_successes_,
155-
sizeof(atomic_ctr_type),
156-
cudaMemcpyDeviceToHost,
157-
cuda::stream_ref{stream});
153+
CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
154+
num_successes_,
155+
sizeof(atomic_ctr_type),
156+
cudaMemcpyDeviceToHost,
157+
cuda::stream_ref{stream}));
158158
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
159159

160160
size_ += h_num_successes;
@@ -185,11 +185,11 @@ void static_map<Key, Value, Scope, Allocator>::erase(
185185

186186
detail::erase<block_size, tile_size>
187187
<<<grid_size, block_size, 0, stream>>>(first, num_keys, num_successes_, view, hash, key_equal);
188-
cuco::detail::memcpy_async(&h_num_successes,
189-
num_successes_,
190-
sizeof(atomic_ctr_type),
191-
cudaMemcpyDeviceToHost,
192-
cuda::stream_ref{stream});
188+
CUCO_CUDA_TRY(cuco::detail::memcpy_async(&h_num_successes,
189+
num_successes_,
190+
sizeof(atomic_ctr_type),
191+
cudaMemcpyDeviceToHost,
192+
cuda::stream_ref{stream}));
193193

194194
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); // stream sync to ensure h_num_successes is updated
195195

@@ -259,8 +259,8 @@ std::pair<KeyOut, ValueOut> static_map<Key, Value, Scope, Allocator>::retrieve_a
259259
stream);
260260

261261
std::size_t h_num_out;
262-
cuco::detail::memcpy_async(
263-
&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream});
262+
CUCO_CUDA_TRY(cuco::detail::memcpy_async(
263+
&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, cuda::stream_ref{stream}));
264264
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
265265
temp_allocator.deallocate(
266266
reinterpret_cast<char*>(d_num_out), sizeof(std::size_t), cuda::stream_ref{stream});

include/cuco/detail/utility/memcpy_async.cuh

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,17 +33,15 @@ namespace cuco::detail {
3333
* @param count Number of bytes to copy
3434
* @param kind Memory copy direction
3535
* @param stream CUDA stream for the operation
36+
* @return cudaError_t Error code from the memory copy operation
3637
*/
37-
inline void memcpy_async(
38+
[[nodiscard]] inline cudaError_t memcpy_async(
3839
void* dst, void const* src, size_t count, cudaMemcpyKind kind, cuda::stream_ref stream)
3940
{
40-
if (dst == nullptr || src == nullptr || count == 0) { return; }
41+
if (dst == nullptr || src == nullptr || count == 0) { return cudaSuccess; }
4142

4243
#if CUDART_VERSION >= 12080
43-
if (stream.get() == 0) {
44-
CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get()));
45-
return;
46-
}
44+
if (stream.get() == 0) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); }
4745

4846
void* dsts[1] = {dst};
4947
void* srcs[1] = {const_cast<void*>(src)};
@@ -55,15 +53,14 @@ inline void memcpy_async(
5553
attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute;
5654

5755
#if CUDART_VERSION >= 13000
58-
CUCO_CUDA_TRY(cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get()));
56+
return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get());
5957
#else
6058
std::size_t fail_idx;
61-
CUCO_CUDA_TRY(
62-
cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get()));
59+
return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get());
6360
#endif // CUDART_VERSION >= 13000
6461
#else
6562
// CUDA < 12.8 - use regular cudaMemcpyAsync
66-
CUCO_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream.get()));
63+
return cudaMemcpyAsync(dst, src, count, kind, stream.get());
6764
#endif // CUDART_VERSION >= 12080
6865
}
6966
} // namespace cuco::detail

0 commit comments

Comments
 (0)