Skip to content

Commit b1bfa0e

Browse files
authored
[SYCLomatic #691] Add test case for masked version __shfl* function migration (#255)
Signed-off-by: Wang, Hao3 <[email protected]>
1 parent d3930d3 commit b1bfa0e

File tree

4 files changed

+399
-2
lines changed

4 files changed

+399
-2
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
3+
<test driverID="test_feature" name="TEMPLATE">
4+
<description>test</description>
5+
<files>
6+
<file path="feature_case/sync_warp_p2/${testName}.cu" />
7+
</files>
8+
<rules>
9+
<optlevelRule excludeOptlevelNameString="cpu" />
10+
<platformRule OSFamily="Linux" kit="CUDA9.2" kitRange="OLDER" runOnThisPlatform="false"/>
11+
<platformRule OSFamily="Windows" kit="CUDA9.2" kitRange="OLDER" runOnThisPlatform="false"/>
12+
</rules>
13+
</test>
14+
Lines changed: 381 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,381 @@
1+
// ====------ sync_warp_p2.cu---------- *- CUDA -* ----===////
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//
8+
// ===----------------------------------------------------------------------===//
9+
#include <iostream>
10+
#include <cuda_runtime.h>
11+
#include <vector>
12+
#include <set>
13+
#define WARP_SIZE 32
14+
#define DATA_NUM 128
15+
16+
template<typename T = int>
17+
void init_data(T* data, int num) {
18+
std::vector<T> host_data(num);
19+
for(int i = 0; i < num; i++)
20+
host_data[i] = i + 3;
21+
cudaMemcpy(data, host_data.data(), num * sizeof(T), cudaMemcpyHostToDevice);
22+
}
23+
24+
template<typename T = int>
25+
bool verify_data(T* data, T* expect, int num, int step = 1, std::set<int> ignore_index = {}) {
26+
std::vector<T> host_data(num);
27+
cudaMemcpy(host_data.data(), data, num * sizeof(T), cudaMemcpyDeviceToHost);
28+
for(int i = 0; i < num; i = i + step) {
29+
if(ignore_index.count(i))
30+
continue;
31+
if(host_data[i] != expect[i]) {
32+
return false;
33+
}
34+
}
35+
return true;
36+
}
37+
38+
template<typename T = int>
39+
void print_data(T* data, int num, bool is_host = false) {
40+
if(is_host) {
41+
for (int i = 0; i < num; i++) {
42+
std::cout << data[i] << ", ";
43+
if((i+1)%16 == 0)
44+
std::cout << std::endl;
45+
}
46+
std::cout << std::endl;
47+
return;
48+
}
49+
std::vector<T> host_data(num);
50+
cudaMemcpy(host_data.data(), data, num * sizeof(T), cudaMemcpyDeviceToHost);
51+
for (int i = 0; i < num; i++) {
52+
std::cout << host_data[i] << ", ";
53+
if((i+1)%16 == 0)
54+
std::cout << std::endl;
55+
}
56+
std::cout << std::endl;
57+
}
58+
59+
60+
//sync API
61+
__global__ void ShuffleSyncKernel1(unsigned int* data) {
62+
int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
63+
int output = 0;
64+
unsigned int mask = 0xFFFFFFF0;
65+
output = __shfl_sync(mask, threadid, threadid + 1, 16);
66+
data[threadid] = output;
67+
}
68+
__global__ void ShuffleUpSyncKernel1(unsigned int* data) {
69+
int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
70+
int output = 0;
71+
unsigned int mask = 0xFFFFFFF0;
72+
output = __shfl_up_sync(mask, threadid, 1, 16);
73+
data[threadid] = output;
74+
}
75+
__global__ void ShuffleDownSyncKernel1(unsigned int* data) {
76+
int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
77+
int output = 0;
78+
unsigned int mask = 0xFFFFFFF0;
79+
output = __shfl_down_sync(mask, threadid, 1, 16);
80+
data[threadid] = output;
81+
}
82+
__global__ void ShuffleXorSyncKernel1(unsigned int* data) {
83+
int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
84+
int output = 0;
85+
unsigned int mask = 0xFFFFFFF0;
86+
output = __shfl_xor_sync(mask, threadid, 2, 16);
87+
data[threadid] = output;
88+
}
89+
90+
//has branch1
91+
__global__ void ShuffleSyncKernel2(unsigned int* data) {
92+
int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
93+
int output = 0;
94+
unsigned int mask = 0xFFFFFFF0;
95+
if(threadid%32 >3) {
96+
output = __shfl_sync(mask, threadid, threadid + 1, 16);
97+
}
98+
data[threadid] = output;
99+
}
100+
__global__ void ShuffleUpSyncKernel2(unsigned int* data) {
101+
int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
102+
int output = 0;
103+
unsigned int mask = 0xFFFFFFF0;
104+
if(threadid%32 >3) {
105+
output = __shfl_up_sync(mask, threadid, 1, 16);
106+
}
107+
data[threadid] = output;
108+
}
109+
__global__ void ShuffleDownSyncKernel2(unsigned int* data) {
110+
int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
111+
int output = 0;
112+
unsigned int mask = 0x0FFFFFFF;
113+
if(threadid%32 < 28) {
114+
output = __shfl_down_sync(mask, threadid, 1, 16);
115+
}
116+
data[threadid] = output;
117+
}
118+
__global__ void ShuffleXorSyncKernel2(unsigned int* data) {
119+
int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
120+
int output = 0;
121+
unsigned int mask = 0xFFFFFFF0;
122+
if(threadid%32 >3) {
123+
output = __shfl_xor_sync(mask, threadid, 2, 16);
124+
}
125+
data[threadid] = output;
126+
}
127+
128+
// has branch 2
129+
__global__ void ShuffleSyncKernel3(unsigned int* data) {
130+
int threadid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y + blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
131+
int output = 0;
132+
unsigned int mask = 0xFFFFFFF0;
133+
if(threadid%32 >3) {
134+
unsigned remote = threadid ? threadid - 1 : 0;
135+
output = __shfl_sync(mask, threadid, remote , 16);
136+
}
137+
data[threadid] = output;
138+
}
139+
140+
int main() {
141+
bool Result = true;
142+
int* dev_data = nullptr;
143+
unsigned int *dev_data_u = nullptr;
144+
std::set<int> ignore_index;
145+
dim3 GridSize;
146+
dim3 BlockSize;
147+
cudaMalloc(&dev_data, DATA_NUM * sizeof(int));
148+
cudaMalloc(&dev_data_u, DATA_NUM * sizeof(unsigned int));
149+
GridSize = {2};
150+
BlockSize = {32, 2, 1};
151+
// NV hardware result reference
152+
unsigned int expect1[DATA_NUM] = {
153+
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 16,
154+
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 32, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 48,
155+
65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 64, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 80,
156+
97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 96, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 112,
157+
};
158+
init_data<unsigned int>(dev_data_u, DATA_NUM);
159+
160+
ShuffleSyncKernel1<<<GridSize, BlockSize>>>(dev_data_u);
161+
162+
cudaDeviceSynchronize();
163+
if(!verify_data<unsigned int>(dev_data_u, expect1, DATA_NUM)) {
164+
std::cout << "ShuffleSyncKernel1" << " verify failed" << std::endl;
165+
Result = false;
166+
std::cout << "expect:" << std::endl;
167+
print_data<unsigned int>(expect1, DATA_NUM, true);
168+
std::cout << "current result:" << std::endl;
169+
print_data<unsigned int>(dev_data_u, DATA_NUM);
170+
}
171+
172+
GridSize = {2};
173+
BlockSize = {32, 2, 1};
174+
// NV hardware result reference
175+
unsigned int expect2[DATA_NUM] = {
176+
0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
177+
32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
178+
64, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
179+
96, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
180+
};
181+
init_data<unsigned int>(dev_data_u, DATA_NUM);
182+
183+
ShuffleUpSyncKernel1<<<GridSize, BlockSize>>>(dev_data_u);
184+
185+
cudaDeviceSynchronize();
186+
if(!verify_data<unsigned int>(dev_data_u, expect2, DATA_NUM)) {
187+
std::cout << "ShuffleUpSyncKernel1" << " verify failed" << std::endl;
188+
Result = false;
189+
std::cout << "expect:" << std::endl;
190+
print_data<unsigned int>(expect2, DATA_NUM, true);
191+
std::cout << "current result:" << std::endl;
192+
print_data<unsigned int>(dev_data_u, DATA_NUM);
193+
}
194+
195+
196+
GridSize = {2};
197+
BlockSize = {32, 2, 1};
198+
// NV hardware result reference
199+
unsigned int expect3[DATA_NUM] = {
200+
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31,
201+
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 63,
202+
65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 79, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 95,
203+
97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127,
204+
};
205+
init_data<unsigned int>(dev_data_u, DATA_NUM);
206+
207+
ShuffleDownSyncKernel1<<<GridSize, BlockSize>>>(dev_data_u);
208+
209+
cudaDeviceSynchronize();
210+
if(!verify_data<unsigned int>(dev_data_u, expect3, DATA_NUM)) {
211+
std::cout << "ShuffleDownSyncKernel1" << " verify failed" << std::endl;
212+
Result = false;
213+
std::cout << "expect:" << std::endl;
214+
print_data<unsigned int>(expect3, DATA_NUM, true);
215+
std::cout << "current result:" << std::endl;
216+
print_data<unsigned int>(dev_data_u, DATA_NUM);
217+
}
218+
219+
GridSize = {2};
220+
BlockSize = {32, 2, 1};
221+
// NV hardware result reference
222+
unsigned int expect4[DATA_NUM] = {
223+
2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,
224+
28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,
225+
59,56,57,62,63,60,61,66,67,64,65,70,71,68,69,74,75,72,73,78,79,76,77,82,83,80,81,
226+
86,87,84,85,90,91,88,89,94,95,92,93,98,99,96,97,102,103,100,101,106,107,104,105,
227+
110,111,108,109,114,115,112,113,118,119,116,117,122,123,120,121,126,127,124,125
228+
};
229+
init_data<unsigned int>(dev_data_u, DATA_NUM);
230+
231+
ShuffleXorSyncKernel1<<<GridSize, BlockSize>>>(dev_data_u);
232+
233+
cudaDeviceSynchronize();
234+
if(!verify_data<unsigned int>(dev_data_u, expect4, DATA_NUM)) {
235+
std::cout << "ShuffleXorSyncKernel1" << " verify failed" << std::endl;
236+
Result = false;
237+
std::cout << "expect:" << std::endl;
238+
print_data<unsigned int>(expect4, DATA_NUM, true);
239+
std::cout << "current result:" << std::endl;
240+
print_data<unsigned int>(dev_data_u, DATA_NUM);
241+
}
242+
243+
// has branch 1
244+
GridSize = {2};
245+
BlockSize = {32, 2, 1};
246+
// NV hardware result reference
247+
unsigned int expect5[DATA_NUM] = {
248+
0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 16,
249+
0, 0, 0, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 48,
250+
0, 0, 0, 0, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 0, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 80,
251+
0, 0, 0, 0, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 0, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 112,
252+
};
253+
init_data<unsigned int>(dev_data_u, DATA_NUM);
254+
255+
ShuffleSyncKernel2<<<GridSize, BlockSize>>>(dev_data_u);
256+
257+
cudaDeviceSynchronize();
258+
// The result[15, 47, 79, 111] is undefined, so ignore those value.
259+
ignore_index = {15, 47, 79, 111};
260+
if(!verify_data<unsigned int>(dev_data_u, expect5, DATA_NUM, 1, ignore_index)) {
261+
std::cout << "ShuffleSyncKernel2" << " verify failed" << std::endl;
262+
Result = false;
263+
std::cout << "expect:" << std::endl;
264+
print_data<unsigned int>(expect5, DATA_NUM, true);
265+
std::cout << "current result:" << std::endl;
266+
print_data<unsigned int>(dev_data_u, DATA_NUM);
267+
}
268+
269+
GridSize = {2};
270+
BlockSize = {32, 2, 1};
271+
// NV hardware result reference
272+
unsigned int expect6[DATA_NUM] = {
273+
0, 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
274+
0, 0, 0, 0, 0, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
275+
0, 0, 0, 0, 0, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
276+
0, 0, 0, 0, 0, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
277+
};
278+
init_data<unsigned int>(dev_data_u, DATA_NUM);
279+
280+
ShuffleUpSyncKernel2<<<GridSize, BlockSize>>>(dev_data_u);
281+
282+
cudaDeviceSynchronize();
283+
// The result[4, 36, 68, 100] is undefined, so ignore those value.
284+
ignore_index = {4, 36, 68, 100};
285+
if(!verify_data<unsigned int>(dev_data_u, expect6, DATA_NUM, 1, ignore_index)) {
286+
std::cout << "ShuffleUpSyncKernel2" << " verify failed" << std::endl;
287+
Result = false;
288+
std::cout << "expect:" << std::endl;
289+
print_data<unsigned int>(expect6, DATA_NUM, true);
290+
std::cout << "current result:" << std::endl;
291+
print_data<unsigned int>(dev_data_u, DATA_NUM);
292+
}
293+
294+
295+
GridSize = {2};
296+
BlockSize = {32, 2, 1};
297+
// NV hardware result reference
298+
// The result[27/59/91/123] of _shfl_down function in delta 4 and logical warp size 16 is undefined.
299+
// But the SYCL version return 28/60/92/124, so we change these 4 number in reference to result of
300+
// SYCL version function.
301+
unsigned int expect7[DATA_NUM] = {
302+
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 0, 0, 0, 0,
303+
33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 0, 0, 0, 0, 0,
304+
65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 79, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 0, 0, 0, 0, 0,
305+
97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 0, 0, 0, 0, 0,
306+
};
307+
init_data<unsigned int>(dev_data_u, DATA_NUM);
308+
309+
ShuffleDownSyncKernel2<<<GridSize, BlockSize>>>(dev_data_u);
310+
311+
cudaDeviceSynchronize();
312+
// The result[27, 59, 91, 123] is undefined, so ignore those value.
313+
ignore_index = {27, 59, 91, 123};
314+
if(!verify_data<unsigned int>(dev_data_u, expect7, DATA_NUM, 1, ignore_index)) {
315+
std::cout << "ShuffleDownSyncKernel2" << " verify failed" << std::endl;
316+
Result = false;
317+
std::cout << "expect:" << std::endl;
318+
print_data<unsigned int>(expect7, DATA_NUM, true);
319+
std::cout << "current result:" << std::endl;
320+
print_data<unsigned int>(dev_data_u, DATA_NUM);
321+
}
322+
323+
GridSize = {2};
324+
BlockSize = {32, 2, 1};
325+
// NV hardware result reference
326+
unsigned int expect8[DATA_NUM] = {
327+
0, 0, 0, 0, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 18, 19, 16, 17, 22, 23, 20, 21, 26, 27, 24, 25, 30, 31, 28, 29,
328+
0, 0, 0, 0, 38, 39, 36, 37, 42, 43, 40, 41, 46, 47, 44, 45, 50, 51, 48, 49, 54, 55, 52, 53, 58, 59, 56, 57, 62, 63, 60, 61,
329+
0, 0, 0, 0, 70, 71, 68, 69, 74, 75, 72, 73, 78, 79, 76, 77, 82, 83, 80, 81, 86, 87, 84, 85, 90, 91, 88, 89, 94, 95, 92, 93,
330+
0, 0, 0, 0, 102, 103, 100, 101, 106, 107, 104, 105, 110, 111, 108, 109, 114, 115, 112, 113, 118, 119, 116, 117, 122, 123, 120, 121, 126, 127, 124, 125,
331+
};
332+
init_data<unsigned int>(dev_data_u, DATA_NUM);
333+
334+
ShuffleXorSyncKernel2<<<GridSize, BlockSize>>>(dev_data_u);
335+
336+
cudaDeviceSynchronize();
337+
if(!verify_data<unsigned int>(dev_data_u, expect8, DATA_NUM)) {
338+
std::cout << "ShuffleXorSyncKernel2" << " verify failed" << std::endl;
339+
Result = false;
340+
std::cout << "expect:" << std::endl;
341+
print_data<unsigned int>(expect8, DATA_NUM, true);
342+
std::cout << "current result:" << std::endl;
343+
print_data<unsigned int>(dev_data_u, DATA_NUM);
344+
}
345+
346+
// has branch 2
347+
348+
GridSize = {2};
349+
BlockSize = {32, 2, 1};
350+
// NV hardware result reference
351+
unsigned int expect9[DATA_NUM] = {
352+
0, 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
353+
0, 0, 0, 0, 0, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 63, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
354+
0, 0, 0, 0, 0, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 95, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
355+
0, 0, 0, 0, 0, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 127, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
356+
};
357+
init_data<unsigned int>(dev_data_u, DATA_NUM);
358+
359+
ShuffleSyncKernel3<<<GridSize, BlockSize>>>(dev_data_u);
360+
361+
cudaDeviceSynchronize();
362+
// The result[4, 36, 68, 100] is undefined, so ignore those value.
363+
ignore_index = {4, 36, 68, 100};
364+
if(!verify_data<unsigned int>(dev_data_u, expect9, DATA_NUM, 1, ignore_index)) {
365+
std::cout << "ShuffleSyncKernel3" << " verify failed" << std::endl;
366+
Result = false;
367+
std::cout << "expect:" << std::endl;
368+
print_data<unsigned int>(expect9, DATA_NUM, true);
369+
std::cout << "current result:" << std::endl;
370+
print_data<unsigned int>(dev_data_u, DATA_NUM);
371+
}
372+
373+
if(Result)
374+
std::cout << "passed" << std::endl;
375+
else {
376+
exit(-1);
377+
}
378+
return 0;
379+
}
380+
381+

0 commit comments

Comments
 (0)