
Commit 53db249

[SYCLomatic #2619] Add test for migration of asm PTX red instruction (#873)
Signed-off-by: chenwei.sun <[email protected]>
1 parent 7f09f61 commit 53db249
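
For context, the PTX red.relaxed.gpu.global.* instructions exercised by this test are "fire-and-forget" atomic reductions: each thread combines its value into a global-memory location without reading back the old value. A natural SYCL 2020 counterpart is a relaxed, device-scoped atomic. The sketch below shows one possible mapping for the add.f32 case used in relaxed_add_kernel further down; the function name relaxed_add and the assumption that data is a device-accessible (USM) pointer are illustrative only, and this is not necessarily the exact code SYCLomatic emits.

    // Illustrative SYCL 2020 counterpart of the PTX pattern
    //   red.relaxed.gpu.global.add.f32 [data], value;
    // A sketch under the assumptions stated above, not SYCLomatic output.
    #include <sycl/sycl.hpp>

    void relaxed_add(sycl::queue &q, float *data, int n) {
      // 'data' is assumed to be device-accessible memory (e.g. sycl::malloc_device).
      q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> idx) {
         float value = data[idx];
         // Relaxed, device-scoped atomic add into data[0]; the old value is
         // discarded, matching the fire-and-forget semantics of PTX "red".
         sycl::atomic_ref<float, sycl::memory_order::relaxed,
                          sycl::memory_scope::device,
                          sycl::access::address_space::global_space>
             ref(data[0]);
         ref.fetch_add(value);
       }).wait();
    }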

File tree

3 files changed: +284 -1 lines changed

features/feature_case/asm/asm_red.cu

Lines changed: 282 additions & 0 deletions
@@ -0,0 +1,282 @@
// ====------ asm_red.cu ---------------------------------- *- CUDA -* ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//
// ===---------------------------------------------------------------------===//

#include "cuda.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <iostream>

__global__ void relaxed_add_kernel(float *data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < n) {
    float value = data[idx];

    asm volatile("red.relaxed.gpu.global.add.f32 [%0], %1;"
                 :
                 : "l"(data), "f"(value));
  }
}

__global__ void relaxed_or_kernel(int *data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < n) {
    int value = data[idx];

    asm volatile("red.relaxed.gpu.global.or.b32 [%0], %1;"
                 :
                 : "l"(data), "r"(value));
  }
}

__global__ void relaxed_xor_kernel(int *data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < n) {
    int value = data[idx];

    asm volatile("red.relaxed.gpu.global.xor.b32 [%0], %1;"
                 :
                 : "l"(data), "r"(value));
  }
}

__global__ void relaxed_and_kernel(int *data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < n) {
    int value = data[idx];

    asm volatile("red.relaxed.gpu.global.and.b32 [%0], %1;"
                 :
                 : "l"(data), "r"(value));
  }
}

__global__ void relaxed_max_kernel(int *data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < n) {
    int value = data[idx];

    asm volatile("red.relaxed.gpu.global.max.s32 [%0], %1;"
                 :
                 : "l"(data), "r"(value));
  }
}

__global__ void relaxed_min_kernel(int *data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if (idx < n) {
    int value = data[idx];

    asm volatile("red.relaxed.gpu.global.min.s32 [%0], %1;"
                 :
                 : "l"(data), "r"(value));
  }
}

void relaxed_add_kernel_test(void) {
  const int size = 100;
  float *d_data, h_data[size];

  // Initialize host data
  for (int i = 0; i < size; i++) {
    h_data[i] = static_cast<float>(i);
  }

  // Allocate device memory
  cudaMalloc(&d_data, size * sizeof(float));

  // Copy data to device
  cudaMemcpy(d_data, h_data, size * sizeof(float), cudaMemcpyHostToDevice);

  relaxed_add_kernel<<<1, size>>>(d_data, size);
  cudaDeviceSynchronize();
  // Copy results back to host
  cudaMemcpy(h_data, d_data, size * sizeof(float), cudaMemcpyDeviceToHost);

  // Free device memory
  cudaFree(d_data);

  if (h_data[0] != 4950) {
    std::cout << "add value: " << h_data[0] << std::endl;
    std::cout << "relaxed_add_kernel_test run failed!\n";
    exit(-1);
  }
  std::cout << "relaxed_add_kernel_test run passed!\n";
}

void relaxed_or_kernel_test(void) {
  const int size = 50;
  int *d_data, h_data[size];

  // Initialize host data
  for (int i = 0; i < size; i++) {
    h_data[i] = 0xF;
  }

  // Allocate device memory
  cudaMalloc(&d_data, size * sizeof(int));

  // Copy data to device
  cudaMemcpy(d_data, h_data, size * sizeof(int), cudaMemcpyHostToDevice);

  relaxed_or_kernel<<<1, size>>>(d_data, size);
  cudaDeviceSynchronize();
  // Copy results back to host
  cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost);

  // Free device memory
  cudaFree(d_data);

  if (h_data[0] != 0xF) {
    std::cout << "or value: " << h_data[0] << std::endl;
    std::cout << "relaxed_or_kernel_test run failed!\n";
    exit(-1);
  }
  std::cout << "relaxed_or_kernel_test run passed!\n";
}

void relaxed_xor_kernel_test(void) {
  const int size = 2;
  int *d_data, h_data[size];

  // Initialize host data
  for (int i = 0; i < size; i++) {
    h_data[i] = 0xFFFFFFFF;
  }

  // Allocate device memory
  cudaMalloc(&d_data, size * sizeof(int));

  // Copy data to device
  cudaMemcpy(d_data, h_data, size * sizeof(int), cudaMemcpyHostToDevice);

  relaxed_xor_kernel<<<1, size>>>(d_data, size);
  cudaDeviceSynchronize();
  // Copy results back to host
  cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost);

  // Free device memory
  cudaFree(d_data);

  if (h_data[0] != 0x0) {
    std::cout << "xor value: " << h_data[0] << std::endl;
    std::cout << "relaxed_xor_kernel_test run failed!\n";
    exit(-1);
  }
  std::cout << "relaxed_xor_kernel_test run passed!\n";
}

void relaxed_and_kernel_test(void) {
  const int size = 32;
  int *d_data, h_data[size];

  // Initialize host data
  for (int i = 0; i < size; i++) {
    h_data[i] = 0xF;
  }

  // Allocate device memory
  cudaMalloc(&d_data, size * sizeof(int));

  // Copy data to device
  cudaMemcpy(d_data, h_data, size * sizeof(int), cudaMemcpyHostToDevice);

  relaxed_and_kernel<<<1, size>>>(d_data, size);
  cudaDeviceSynchronize();
  // Copy results back to host
  cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost);

  // Free device memory
  cudaFree(d_data);

  if (h_data[0] != 0xF) {
    std::cout << "and value: " << h_data[0] << std::endl;
    std::cout << "relaxed_and_kernel_test run failed!\n";
    exit(-1);
  }
  std::cout << "relaxed_and_kernel_test run passed!\n";
}

void relaxed_max_kernel_test(void) {
  const int size = 100;
  int *d_data, h_data[size];

  // Initialize host data
  for (int i = 0; i < size; i++) {
    h_data[i] = i;
  }

  // Allocate device memory
  cudaMalloc(&d_data, size * sizeof(float));

  // Copy data to device
  cudaMemcpy(d_data, h_data, size * sizeof(float), cudaMemcpyHostToDevice);

  relaxed_max_kernel<<<1, size>>>(d_data, size);
  cudaDeviceSynchronize();
  // Copy results back to host
  cudaMemcpy(h_data, d_data, size * sizeof(float), cudaMemcpyDeviceToHost);

  // Free device memory
  cudaFree(d_data);

  if (h_data[0] != 99) {
    std::cout << "max value: " << h_data[0] << std::endl;
    std::cout << "relaxed_max_kernel_test run failed!\n";
    exit(-1);
  }
  std::cout << "relaxed_max_kernel_test run passed!\n";
}

void relaxed_min_kernel_test(void) {
  const int size = 100;
  int *d_data, h_data[size];

  // Initialize host data
  for (int i = 0; i < size; i++) {
    h_data[i] = i;
  }

  // Allocate device memory
  cudaMalloc(&d_data, size * sizeof(float));

  // Copy data to device
  cudaMemcpy(d_data, h_data, size * sizeof(float), cudaMemcpyHostToDevice);

  relaxed_min_kernel<<<1, size>>>(d_data, size);
  cudaDeviceSynchronize();
  // Copy results back to host
  cudaMemcpy(h_data, d_data, size * sizeof(float), cudaMemcpyDeviceToHost);

  // Free device memory
  cudaFree(d_data);

  if (h_data[0] != 0) {
    std::cout << "min value: " << h_data[0] << std::endl;
    std::cout << "relaxed_min_kernel_test run failed!\n";
    exit(-1);
  }
  std::cout << "relaxed_min_kernel_test run passed!\n";
}

int main() {
  relaxed_add_kernel_test();
  relaxed_or_kernel_test();
  relaxed_xor_kernel_test();
  relaxed_and_kernel_test();
  relaxed_max_kernel_test();
  relaxed_min_kernel_test();

  return 0;
}

features/features.xml

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
     <test testName="asm" configFile="config/TEMPLATE_asm_excluding_syclcompat.xml" />
     <test testName="asm_bar" configFile="config/TEMPLATE_asm.xml" />
     <test testName="asm_mem" configFile="config/TEMPLATE_asm.xml" />
+    <test testName="asm_red" configFile="config/TEMPLATE_asm.xml" />
     <test testName="asm_atom" configFile="config/TEMPLATE_asm.xml" />
     <test testName="asm_vinst" configFile="config/TEMPLATE_asm_excluding_syclcompat.xml" />
     <test testName="asm_v2inst" configFile="config/TEMPLATE_asm_excluding_syclcompat.xml" />

features/test_feature.py

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@
         'thrust_device_new_delete', 'thrust_temporary_buffer', 'thrust_malloc_free', 'codepin', 'thrust_unique_count',
         'thrust_advance_trans_op_itr', 'cuda_stream_query', "matmul", "matmul_2", "matmul_3", "transform", "context_push_n_pop",
         "graphics_interop_d3d11", 'graph', 'asm_shfl', 'asm_shfl_sync', 'asm_shfl_sync_with_exp', 'asm_membar_fence',
-        'cub_block_store']
+        'cub_block_store', 'asm_red']

 occupancy_calculation_exper = ['occupancy_calculation']
0 commit comments
