Skip to content

Commit 3e66fa1

Browse files
committed
Address review nits for DeviceSegmentedReduce env overloads
- Add missing non-overlap precondition to env ArgMin and ArgMax docs - Reorder env tests: group all env tests before custom stream tests - Add not_guaranteed determinism test for Reduce env API
1 parent bdc87eb commit 3e66fa1

File tree

3 files changed

+89
-55
lines changed

3 files changed

+89
-55
lines changed

cub/cub/device/device_segmented_reduce.cuh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2-
// SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
2+
// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved.
33
// SPDX-License-Identifier: BSD-3
44

55
//! @file
@@ -1284,6 +1284,11 @@ public:
12841284
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
12851285
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
12861286
//! the latter is specified as ``segment_offsets + 1``).
1287+
//! - Let ``s`` be in ``[0, num_segments)``. The range
1288+
//! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
1289+
//! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
1290+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
1291+
//! ``[d_end_offsets, d_end_offsets + num_segments)``.
12871292
//! - Can use a specific stream or cuda memory resource through the ``env`` parameter
12881293
//!
12891294
//! Snippet
@@ -1989,6 +1994,11 @@ public:
19891994
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
19901995
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
19911996
//! the latter is specified as ``segment_offsets + 1``).
1997+
//! - Let ``s`` be in ``[0, num_segments)``. The range
1998+
//! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
1999+
//! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
2000+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
2001+
//! ``[d_end_offsets, d_end_offsets + num_segments)``.
19922002
//! - Can use a specific stream or cuda memory resource through the ``env`` parameter
19932003
//!
19942004
//! Snippet

cub/test/catch2_test_device_segmented_reduce_env.cu

Lines changed: 54 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,60 @@ C2H_TEST("Device segmented max uses environment", "[segmented_reduce][device]")
238238
REQUIRE(d_out == expected);
239239
}
240240

241+
C2H_TEST("Device segmented argmin uses environment", "[segmented_reduce][device]")
242+
{
243+
int num_segments = 3;
244+
thrust::device_vector<int> d_offsets = {0, 4, 7, 9};
245+
auto d_offsets_it = thrust::raw_pointer_cast(d_offsets.data());
246+
thrust::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9, 1, 2};
247+
thrust::device_vector<cub::KeyValuePair<int, int>> d_out(3);
248+
249+
size_t expected_bytes_allocated{};
250+
REQUIRE(
251+
cudaSuccess
252+
== cub::DeviceSegmentedReduce::ArgMin(
253+
nullptr, expected_bytes_allocated, d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1));
254+
255+
auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)};
256+
257+
device_segmented_reduce_argmin(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env);
258+
259+
thrust::host_vector<cub::KeyValuePair<int, int>> h_out(d_out);
260+
REQUIRE(h_out[0].key == 3);
261+
REQUIRE(h_out[0].value == 5);
262+
REQUIRE(h_out[1].key == 1);
263+
REQUIRE(h_out[1].value == 0);
264+
REQUIRE(h_out[2].key == 0);
265+
REQUIRE(h_out[2].value == 1);
266+
}
267+
268+
C2H_TEST("Device segmented argmax uses environment", "[segmented_reduce][device]")
269+
{
270+
int num_segments = 3;
271+
thrust::device_vector<int> d_offsets = {0, 4, 7, 9};
272+
auto d_offsets_it = thrust::raw_pointer_cast(d_offsets.data());
273+
thrust::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9, 1, 2};
274+
thrust::device_vector<cub::KeyValuePair<int, int>> d_out(3);
275+
276+
size_t expected_bytes_allocated{};
277+
REQUIRE(
278+
cudaSuccess
279+
== cub::DeviceSegmentedReduce::ArgMax(
280+
nullptr, expected_bytes_allocated, d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1));
281+
282+
auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)};
283+
284+
device_segmented_reduce_argmax(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env);
285+
286+
thrust::host_vector<cub::KeyValuePair<int, int>> h_out(d_out);
287+
REQUIRE(h_out[0].key == 0);
288+
REQUIRE(h_out[0].value == 8);
289+
REQUIRE(h_out[1].key == 2);
290+
REQUIRE(h_out[1].value == 9);
291+
REQUIRE(h_out[2].key == 1);
292+
REQUIRE(h_out[2].value == 2);
293+
}
294+
241295
TEST_CASE("Device segmented reduce uses custom stream", "[segmented_reduce][device]")
242296
{
243297
int num_segments = 3;
@@ -367,60 +421,6 @@ TEST_CASE("Device segmented max uses custom stream", "[segmented_reduce][device]
367421
REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream));
368422
}
369423

370-
C2H_TEST("Device segmented argmin uses environment", "[segmented_reduce][device]")
371-
{
372-
int num_segments = 3;
373-
thrust::device_vector<int> d_offsets = {0, 4, 7, 9};
374-
auto d_offsets_it = thrust::raw_pointer_cast(d_offsets.data());
375-
thrust::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9, 1, 2};
376-
thrust::device_vector<cub::KeyValuePair<int, int>> d_out(3);
377-
378-
size_t expected_bytes_allocated{};
379-
REQUIRE(
380-
cudaSuccess
381-
== cub::DeviceSegmentedReduce::ArgMin(
382-
nullptr, expected_bytes_allocated, d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1));
383-
384-
auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)};
385-
386-
device_segmented_reduce_argmin(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env);
387-
388-
thrust::host_vector<cub::KeyValuePair<int, int>> h_out(d_out);
389-
REQUIRE(h_out[0].key == 3);
390-
REQUIRE(h_out[0].value == 5);
391-
REQUIRE(h_out[1].key == 1);
392-
REQUIRE(h_out[1].value == 0);
393-
REQUIRE(h_out[2].key == 0);
394-
REQUIRE(h_out[2].value == 1);
395-
}
396-
397-
C2H_TEST("Device segmented argmax uses environment", "[segmented_reduce][device]")
398-
{
399-
int num_segments = 3;
400-
thrust::device_vector<int> d_offsets = {0, 4, 7, 9};
401-
auto d_offsets_it = thrust::raw_pointer_cast(d_offsets.data());
402-
thrust::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9, 1, 2};
403-
thrust::device_vector<cub::KeyValuePair<int, int>> d_out(3);
404-
405-
size_t expected_bytes_allocated{};
406-
REQUIRE(
407-
cudaSuccess
408-
== cub::DeviceSegmentedReduce::ArgMax(
409-
nullptr, expected_bytes_allocated, d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1));
410-
411-
auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)};
412-
413-
device_segmented_reduce_argmax(d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, env);
414-
415-
thrust::host_vector<cub::KeyValuePair<int, int>> h_out(d_out);
416-
REQUIRE(h_out[0].key == 0);
417-
REQUIRE(h_out[0].value == 8);
418-
REQUIRE(h_out[1].key == 2);
419-
REQUIRE(h_out[1].value == 9);
420-
REQUIRE(h_out[2].key == 1);
421-
REQUIRE(h_out[2].value == 2);
422-
}
423-
424424
TEST_CASE("Device segmented argmin uses custom stream", "[segmented_reduce][device]")
425425
{
426426
int num_segments = 3;

cub/test/catch2_test_device_segmented_reduce_env_api.cu

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,3 +291,27 @@ C2H_TEST("cub::DeviceSegmentedReduce::Reduce accepts run_to_run determinism requ
291291
REQUIRE(d_out == expected);
292292
REQUIRE(error == cudaSuccess);
293293
}
294+
295+
C2H_TEST("cub::DeviceSegmentedReduce::Reduce accepts not_guaranteed determinism requirements",
296+
"[segmented_reduce][env]")
297+
{
298+
int num_segments = 3;
299+
thrust::device_vector<int> d_offsets = {0, 3, 3, 7};
300+
auto d_offsets_it = thrust::raw_pointer_cast(d_offsets.data());
301+
thrust::device_vector<int> d_in{8, 6, 7, 5, 3, 0, 9};
302+
thrust::device_vector<int> d_out(3);
303+
304+
auto env = cuda::execution::require(cuda::execution::determinism::not_guaranteed);
305+
306+
auto error = cub::DeviceSegmentedReduce::Reduce(
307+
d_in.begin(), d_out.begin(), num_segments, d_offsets_it, d_offsets_it + 1, ::cuda::std::plus<>{}, 0, env);
308+
thrust::device_vector<int> expected{21, 0, 17};
309+
310+
if (error != cudaSuccess)
311+
{
312+
std::cerr << "cub::DeviceSegmentedReduce::Reduce failed with status: " << error << std::endl;
313+
}
314+
315+
REQUIRE(d_out == expected);
316+
REQUIRE(error == cudaSuccess);
317+
}

0 commit comments

Comments
 (0)