Skip to content

Commit 43f68e9

Browse files
author
Yihan Wang
authored
[SYCLomatic #1019] Add test for cub::DeviceSegmentedSort::{StableSortKeys, StableSortKeysDescending} (#370)
Signed-off-by: Wang, Yihan <[email protected]>
1 parent ecab0d7 commit 43f68e9

File tree

3 files changed

+283
-1
lines changed

3 files changed

+283
-1
lines changed
Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
// ===------- cub_device_segmented_sort_keys.cu ------------ *- CUDA -* ---===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//
8+
// ===---------------------------------------------------------------------===//
9+
10+
#include <cstdlib>
11+
#include <cub/cub.cuh>
12+
#include <iostream>
13+
#include <vector>
14+
15+
bool testStableSortKeys() {
16+
int num_items; // e.g., 7
17+
int num_segments; // e.g., 3
18+
int *d_offsets; // e.g., [0, 3, 3, 7]
19+
int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
20+
int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
21+
22+
num_items = 7;
23+
num_segments = 3;
24+
cudaMallocManaged(&d_offsets, (num_segments + 1) * sizeof(*d_offsets));
25+
cudaMallocManaged(&d_keys_in, num_items * sizeof(*d_keys_in));
26+
cudaMallocManaged(&d_keys_out, num_items * sizeof(*d_keys_out));
27+
28+
d_offsets[0] = 0;
29+
d_offsets[1] = 3;
30+
d_offsets[2] = 3;
31+
d_offsets[3] = 7;
32+
33+
d_keys_in[0] = 8;
34+
d_keys_in[1] = 6;
35+
d_keys_in[2] = 7;
36+
d_keys_in[3] = 5;
37+
d_keys_in[4] = 3;
38+
d_keys_in[5] = 0;
39+
d_keys_in[6] = 9;
40+
41+
cudaDeviceSynchronize();
42+
43+
// Determine temporary device storage requirements
44+
void *d_temp_storage = NULL;
45+
size_t temp_storage_bytes = 0;
46+
cub::DeviceSegmentedSort::StableSortKeys(
47+
d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items,
48+
num_segments, d_offsets, d_offsets + 1);
49+
50+
// Allocate temporary storage
51+
cudaMalloc(&d_temp_storage, temp_storage_bytes);
52+
53+
// Run sorting operation
54+
cub::DeviceSegmentedSort::StableSortKeys(
55+
d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items,
56+
num_segments, d_offsets, d_offsets + 1);
57+
58+
// d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
59+
// d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
60+
61+
std::vector<int> d_keys_out_expected = {6, 7, 8, 0, 3, 5, 9};
62+
std::vector<int> d_values_out_expected = {1, 2, 0, 5, 4, 3, 6};
63+
64+
cudaDeviceSynchronize();
65+
for (int i = 0; i < num_items; i++) {
66+
if (d_keys_out[i] != d_keys_out_expected[i]) {
67+
return false;
68+
}
69+
}
70+
71+
return true;
72+
}
73+
74+
bool testStableSortKeysDB() {
75+
// Declare, allocate, and initialize device-accessible pointers
76+
// for sorting data
77+
int num_items; // e.g., 7
78+
int num_segments; // e.g., 3
79+
int *d_offsets; // e.g., [0, 3, 3, 7]
80+
int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
81+
int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
82+
83+
num_items = 7;
84+
num_segments = 3;
85+
cudaMallocManaged(&d_offsets, (num_segments + 1) * sizeof(*d_offsets));
86+
cudaMallocManaged(&d_keys_in, num_items * sizeof(*d_keys_in));
87+
cudaMallocManaged(&d_keys_out, num_items * sizeof(*d_keys_out));
88+
89+
d_offsets[0] = 0;
90+
d_offsets[1] = 3;
91+
d_offsets[2] = 3;
92+
d_offsets[3] = 7;
93+
94+
d_keys_in[0] = 8;
95+
d_keys_in[1] = 6;
96+
d_keys_in[2] = 7;
97+
d_keys_in[3] = 5;
98+
d_keys_in[4] = 3;
99+
d_keys_in[5] = 0;
100+
d_keys_in[6] = 9;
101+
102+
cub::DoubleBuffer<int> d_keys(d_keys_in, d_keys_out);
103+
104+
cudaDeviceSynchronize();
105+
106+
// Determine temporary device storage requirements
107+
void *d_temp_storage = NULL;
108+
size_t temp_storage_bytes = 0;
109+
cub::DeviceSegmentedSort::StableSortKeys(d_temp_storage, temp_storage_bytes,
110+
d_keys, num_items, num_segments,
111+
d_offsets, d_offsets + 1);
112+
113+
// Allocate temporary storage
114+
cudaMalloc(&d_temp_storage, temp_storage_bytes);
115+
116+
// Run sorting operation
117+
cub::DeviceSegmentedSort::StableSortKeys(d_temp_storage, temp_storage_bytes,
118+
d_keys, num_items, num_segments,
119+
d_offsets, d_offsets + 1);
120+
121+
// d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
122+
// d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
123+
124+
std::vector<int> d_keys_out_expected = {6, 7, 8, 0, 3, 5, 9};
125+
std::vector<int> d_values_out_expected = {1, 2, 0, 5, 4, 3, 6};
126+
127+
cudaDeviceSynchronize();
128+
for (int i = 0; i < num_items; i++) {
129+
if (d_keys.Current()[i] != d_keys_out_expected[i]) {
130+
return false;
131+
}
132+
}
133+
134+
return true;
135+
}
136+
137+
bool testStableSortKeysDescending() {
138+
int num_items; // e.g., 7
139+
int num_segments; // e.g., 3
140+
int *d_offsets; // e.g., [0, 3, 3, 7]
141+
int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
142+
int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
143+
144+
num_items = 7;
145+
num_segments = 3;
146+
cudaMallocManaged(&d_offsets, (num_segments + 1) * sizeof(*d_offsets));
147+
cudaMallocManaged(&d_keys_in, num_items * sizeof(*d_keys_in));
148+
cudaMallocManaged(&d_keys_out, num_items * sizeof(*d_keys_out));
149+
150+
d_offsets[0] = 0;
151+
d_offsets[1] = 3;
152+
d_offsets[2] = 3;
153+
d_offsets[3] = 7;
154+
155+
d_keys_in[0] = 8;
156+
d_keys_in[1] = 6;
157+
d_keys_in[2] = 7;
158+
d_keys_in[3] = 5;
159+
d_keys_in[4] = 3;
160+
d_keys_in[5] = 0;
161+
d_keys_in[6] = 9;
162+
163+
cudaDeviceSynchronize();
164+
165+
// Determine temporary device storage requirements
166+
void *d_temp_storage = NULL;
167+
size_t temp_storage_bytes = 0;
168+
cub::DeviceSegmentedSort::StableSortKeysDescending(
169+
d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items,
170+
num_segments, d_offsets, d_offsets + 1);
171+
172+
// Allocate temporary storage
173+
cudaMalloc(&d_temp_storage, temp_storage_bytes);
174+
175+
// Run sorting operation
176+
cub::DeviceSegmentedSort::StableSortKeysDescending(
177+
d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items,
178+
num_segments, d_offsets, d_offsets + 1);
179+
180+
// d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
181+
// d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
182+
183+
std::vector<int> d_keys_out_expected = {8, 7, 6, 9, 5, 3, 0};
184+
std::vector<int> d_values_out_expected = {0, 2, 1, 6, 3, 4, 5};
185+
186+
cudaDeviceSynchronize();
187+
for (int i = 0; i < num_items; i++) {
188+
if (d_keys_out[i] != d_keys_out_expected[i]) {
189+
return false;
190+
}
191+
}
192+
193+
return true;
194+
}
195+
196+
bool testStableSortKeysDescendingDB() {
197+
int num_items; // e.g., 7
198+
int num_segments; // e.g., 3
199+
int *d_offsets; // e.g., [0, 3, 3, 7]
200+
int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
201+
int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
202+
203+
num_items = 7;
204+
num_segments = 3;
205+
cudaMallocManaged(&d_offsets, (num_segments + 1) * sizeof(*d_offsets));
206+
cudaMallocManaged(&d_keys_in, num_items * sizeof(*d_keys_in));
207+
cudaMallocManaged(&d_keys_out, num_items * sizeof(*d_keys_out));
208+
209+
d_offsets[0] = 0;
210+
d_offsets[1] = 3;
211+
d_offsets[2] = 3;
212+
d_offsets[3] = 7;
213+
214+
d_keys_in[0] = 8;
215+
d_keys_in[1] = 6;
216+
d_keys_in[2] = 7;
217+
d_keys_in[3] = 5;
218+
d_keys_in[4] = 3;
219+
d_keys_in[5] = 0;
220+
d_keys_in[6] = 9;
221+
222+
cub::DoubleBuffer<int> d_keys(d_keys_in, d_keys_out);
223+
224+
cudaDeviceSynchronize();
225+
226+
// Determine temporary device storage requirements
227+
void *d_temp_storage = NULL;
228+
size_t temp_storage_bytes = 0;
229+
cub::DeviceSegmentedSort::StableSortKeysDescending(
230+
d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments,
231+
d_offsets, d_offsets + 1);
232+
233+
// Allocate temporary storage
234+
cudaMalloc(&d_temp_storage, temp_storage_bytes);
235+
236+
// Run sorting operation
237+
cub::DeviceSegmentedSort::StableSortKeysDescending(
238+
d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments,
239+
d_offsets, d_offsets + 1);
240+
241+
// d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
242+
// d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
243+
244+
std::vector<int> d_keys_out_expected = {8, 7, 6, 9, 5, 3, 0};
245+
std::vector<int> d_values_out_expected = {0, 2, 1, 6, 3, 4, 5};
246+
247+
cudaDeviceSynchronize();
248+
for (int i = 0; i < num_items; i++) {
249+
if (d_keys.Current()[i] != d_keys_out_expected[i]) {
250+
return false;
251+
}
252+
}
253+
254+
return true;
255+
}
256+
257+
int main() {
258+
if (!testStableSortKeys()) {
259+
std::cerr << "StableSortKeys test failed" << std::endl;
260+
return EXIT_FAILURE;
261+
}
262+
263+
if (!testStableSortKeysDB()) {
264+
std::cerr << "StableSortKeys (DoubleBuffer) test failed" << std::endl;
265+
return EXIT_FAILURE;
266+
}
267+
268+
if (!testStableSortKeysDescending()) {
269+
std::cerr << "StableSortKeysDescending test failed" << std::endl;
270+
return EXIT_FAILURE;
271+
}
272+
273+
if (!testStableSortKeysDescendingDB()) {
274+
std::cerr << "StableSortKeysDescending (DoubleBuffer) test failed"
275+
<< std::endl;
276+
return EXIT_FAILURE;
277+
}
278+
279+
std::cout << "Tests passed" << std::endl;
280+
return EXIT_SUCCESS;
281+
}

features/features.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@
171171
<test testName="cub_device_seg_radix_sort_pairs" configFile="config/TEMPLATE_cub_device_radix_sort.xml" />
172172
<test testName="cub_device_seg_radix_sort_keys" configFile="config/TEMPLATE_cub_device_radix_sort.xml" />
173173
<test testName="cub_device_segmented_sort_pairs" configFile="config/TEMPLATE_cub_device_segmented_sort.xml" />
174+
<test testName="cub_device_segmented_sort_keys" configFile="config/TEMPLATE_cub_device_segmented_sort.xml" />
174175
<test testName="cub_device_seg_sort_keys" configFile="config/TEMPLATE_cub_device.xml" />
175176
<test testName="cub_device_seg_sort_pairs" configFile="config/TEMPLATE_cub_device.xml" />
176177
<test testName="cub_intrinsic" configFile="config/TEMPLATE_cub_device.xml" />

features/test_feature.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
'thrust-scatter', 'thrust-unique_by_key_copy', 'thrust-for-hypre', 'thrust-merge_by_key',
2424
'thrust-rawptr-noneusm', 'driverStreamAndEvent', 'grid_sync', 'deviceProp', 'gridThreads', 'kernel_library', 'cub_block_p2',
2525
'cub_constant_iterator', 'cub_device_reduce_max', 'cub_device_reduce_min', 'cub_discard_iterator', 'ccl-test', 'ccl-test2',
26-
'cub_device', 'cub_device_reduce_sum', 'cub_device_reduce', 'cub_device_reduce_by_key', 'cub_device_select_unique_by_key',
26+
'cub_device', 'cub_device_reduce_sum', 'cub_device_reduce', 'cub_device_reduce_by_key', 'cub_device_select_unique_by_key', 'cub_device_segmented_sort_keys',
2727
'cub_device_scan_inclusive_scan', 'cub_device_scan_exclusive_scan', 'cub_device_seg_radix_sort_pairs',
2828
'cub_device_scan_inclusive_sum', 'cub_device_scan_exclusive_sum', 'cub_device_select_unique', 'cub_device_radix_sort_keys', 'cub_device_radix_sort_pairs',
2929
'cub_device_select_flagged', 'cub_device_run_length_encide_encode', 'cub_counting_iterator', 'cub_arg_index_input_iterator', 'cub_device_seg_radix_sort_keys',

0 commit comments

Comments
 (0)