Skip to content

Commit 8531065

Browse files
authored
[SYCLomatic #941] Add 6 warp reduce functions tests. (#366)
Signed-off-by: Tang, Jiajun [email protected]
1 parent 57ed3b9 commit 8531065

File tree

3 files changed

+171
-0
lines changed

3 files changed

+171
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
3+
<test driverID="test_feature" name="TEMPLATE">
4+
<description>test</description>
5+
<files>
6+
<file path="feature_case/cpp_language_extensions/${testName}.cu" />
7+
</files>
8+
<rules>
9+
<platformRule OSFamily="Linux" kit="CUDA11.0" kitRange="OLDER" runOnThisPlatform="false"/>
10+
<platformRule OSFamily="Windows" kit="CUDA11.0" kitRange="OLDER" runOnThisPlatform="false"/>
11+
</rules>
12+
</test>
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
// ===----------- warp_reduce_functions.cu---------- *- CUDA -* -----------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//
8+
// ===---------------------------------------------------------------------===//
9+
#include <cuda_runtime.h>
10+
#include <iostream>
11+
#include <vector>
12+
13+
#define DATA_NUM 128
14+
15+
void print_data(unsigned *data) {
16+
std::vector<unsigned> host_data(DATA_NUM);
17+
cudaMemcpy(host_data.data(), data, DATA_NUM * sizeof(unsigned),
18+
cudaMemcpyDeviceToHost);
19+
for (int i = 0; i < DATA_NUM; i++) {
20+
std::cout << host_data[i] << ", ";
21+
if ((i + 1) % 32 == 0)
22+
std::cout << std::endl;
23+
}
24+
std::cout << std::endl;
25+
}
26+
27+
bool check_data(unsigned *data, int first, int second, int third, int fourth) {
28+
std::vector<unsigned> host_data(DATA_NUM);
29+
cudaMemcpy(host_data.data(), data, DATA_NUM * sizeof(unsigned),
30+
cudaMemcpyDeviceToHost);
31+
for (int i = 0; i < DATA_NUM / 4; i++) {
32+
if (host_data[i] != first)
33+
return false;
34+
}
35+
for (int i = DATA_NUM / 4; i < DATA_NUM / 2; i++) {
36+
if (host_data[i] != second)
37+
return false;
38+
}
39+
for (int i = DATA_NUM / 2; i < DATA_NUM * 3 / 4; i++) {
40+
if (host_data[i] != third)
41+
return false;
42+
}
43+
for (int i = DATA_NUM * 3 / 4; i < DATA_NUM; i++) {
44+
if (host_data[i] != fourth)
45+
return false;
46+
}
47+
return true;
48+
}
49+
50+
__global__ void reduce_add_sync(unsigned int *data) {
51+
int thread_id = threadIdx.x + threadIdx.y * blockDim.x +
52+
threadIdx.z * blockDim.x * blockDim.y +
53+
blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
54+
data[thread_id] = __reduce_add_sync(0xFFFF, thread_id);
55+
}
56+
57+
__global__ void reduce_min_sync(unsigned int *data) {
58+
int thread_id = threadIdx.x + threadIdx.y * blockDim.x +
59+
threadIdx.z * blockDim.x * blockDim.y +
60+
blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
61+
data[thread_id] = __reduce_min_sync(0xFFFF, thread_id);
62+
}
63+
64+
__global__ void reduce_max_sync(unsigned int *data) {
65+
int thread_id = threadIdx.x + threadIdx.y * blockDim.x +
66+
threadIdx.z * blockDim.x * blockDim.y +
67+
blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
68+
data[thread_id] = __reduce_max_sync(0xFFFF, thread_id);
69+
}
70+
71+
__global__ void reduce_and_sync(unsigned int *data) {
72+
int thread_id = threadIdx.x + threadIdx.y * blockDim.x +
73+
threadIdx.z * blockDim.x * blockDim.y +
74+
blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
75+
data[thread_id] = __reduce_and_sync(0xFFFF, thread_id);
76+
}
77+
78+
__global__ void reduce_or_sync(unsigned int *data) {
79+
int thread_id = threadIdx.x + threadIdx.y * blockDim.x +
80+
threadIdx.z * blockDim.x * blockDim.y +
81+
blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
82+
data[thread_id] = __reduce_or_sync(0xFFFF, thread_id);
83+
}
84+
85+
__global__ void reduce_xor_sync(unsigned int *data) {
86+
int thread_id = threadIdx.x + threadIdx.y * blockDim.x +
87+
threadIdx.z * blockDim.x * blockDim.y +
88+
blockIdx.x * blockDim.x * blockDim.y * blockDim.z;
89+
data[thread_id] = __reduce_xor_sync(0xFFFF, thread_id);
90+
}
91+
92+
int main() {
93+
int ret = 0;
94+
unsigned *dev_data = nullptr;
95+
cudaMalloc(&dev_data, DATA_NUM * sizeof(unsigned int));
96+
97+
reduce_add_sync<<<2, 64>>>(dev_data);
98+
cudaDeviceSynchronize();
99+
if (!check_data(dev_data, 496, 1520, 2544, 3568)) {
100+
print_data(dev_data);
101+
std::cout << "reduce_add_sync check failed!" << std::endl;
102+
ret++;
103+
} else {
104+
std::cout << "reduce_add_sync check passed!" << std::endl;
105+
}
106+
107+
reduce_min_sync<<<2, 64>>>(dev_data);
108+
cudaDeviceSynchronize();
109+
if (!check_data(dev_data, 0, 32, 64, 96)) {
110+
print_data(dev_data);
111+
std::cout << "reduce_min_sync check failed!" << std::endl;
112+
ret++;
113+
} else {
114+
std::cout << "reduce_min_sync check passed!" << std::endl;
115+
}
116+
117+
reduce_max_sync<<<2, 64>>>(dev_data);
118+
cudaDeviceSynchronize();
119+
if (!check_data(dev_data, 31, 63, 95, 127)) {
120+
print_data(dev_data);
121+
std::cout << "reduce_max_sync check failed!" << std::endl;
122+
ret++;
123+
} else {
124+
std::cout << "reduce_max_sync check passed!" << std::endl;
125+
}
126+
127+
reduce_and_sync<<<2, 64>>>(dev_data);
128+
cudaDeviceSynchronize();
129+
if (!check_data(dev_data, 0, 32, 64, 96)) {
130+
print_data(dev_data);
131+
std::cout << "reduce_and_sync check failed!" << std::endl;
132+
ret++;
133+
} else {
134+
std::cout << "reduce_and_sync check passed!" << std::endl;
135+
}
136+
137+
reduce_or_sync<<<2, 64>>>(dev_data);
138+
cudaDeviceSynchronize();
139+
if (!check_data(dev_data, 31, 63, 95, 127)) {
140+
print_data(dev_data);
141+
std::cout << "reduce_or_sync check failed!" << std::endl;
142+
ret++;
143+
} else {
144+
std::cout << "reduce_or_sync check passed!" << std::endl;
145+
}
146+
147+
reduce_xor_sync<<<2, 64>>>(dev_data);
148+
cudaDeviceSynchronize();
149+
if (!check_data(dev_data, 0, 0, 0, 0)) {
150+
print_data(dev_data);
151+
std::cout << "reduce_xor_sync check failed!" << std::endl;
152+
ret++;
153+
} else {
154+
std::cout << "reduce_xor_sync check passed!" << std::endl;
155+
}
156+
157+
return ret;
158+
}

features/features.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,5 +248,6 @@
248248
<test testName="occupancy_calculation" configFile="config/TEMPLATE_misc.xml" />
249249
<!-- <test testName="math_intel_specific" configFile="config/TEMPLATE_math_intel_specific.xml" splitGroup="double" /> -->
250250
<test testName="remove_unnecessary_wait" configFile="config/TEMPLATE_remove_unnecessary_wait.xml" />
251+
<test testName="warp_reduce_functions" configFile="config/TEMPLATE_cpp_language_extensions.xml" />
251252
</tests>
252253
</suite>

0 commit comments

Comments
 (0)