Skip to content

Commit a25dafc

Browse files
authored
[SYCLomatic #2672, #2588] Add E2E test for migration of PTX cp.sync instruction (#877)
Signed-off-by: chenwei.sun <[email protected]>
1 parent 991ef37 commit a25dafc

File tree

3 files changed

+170
-1
lines changed

3 files changed

+170
-1
lines changed

features/feature_case/asm/asm_cp.cu

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
// ====------ asm_cp.cu ----------------------------------- *- CUDA -* ---===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//
8+
// ===---------------------------------------------------------------------===//
9+
10+
#include <cuda.h>
11+
#include <cuda_runtime.h>
12+
#include <iostream>
13+
14+
#define TEST(FN) \
15+
{ \
16+
if (FN()) { \
17+
printf("Test " #FN " PASS\n"); \
18+
} else { \
19+
printf("Test " #FN " FAIL\n"); \
20+
return 1; \
21+
} \
22+
}
23+
24+
__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
25+
bool pred = true) {
26+
const int BYTES = 16;
27+
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
28+
asm volatile("{\n"
29+
" .reg .pred p;\n"
30+
" setp.ne.b32 p, %0, 0;\n"
31+
" @p cp.async.cg.shared.global [%1], [%2], %3;\n"
32+
"}\n"
33+
:
34+
: "r"((int)pred), "r"(smem), "l"(glob_ptr), "n"(BYTES));
35+
}
36+
37+
__global__ void test_cp_async4_pred(int4 *d_out, int4 *d_in) {
38+
extern __shared__ int4 smem[];
39+
int tid = threadIdx.x;
40+
41+
if (tid % 2) {
42+
cp_async4_pred(&smem[tid], &d_in[tid], true);
43+
44+
asm volatile("cp.async.commit_group;" ::: "memory");
45+
asm volatile("cp.async.wait_all;" ::: "memory");
46+
47+
__syncthreads();
48+
d_out[tid] = smem[tid];
49+
}
50+
}
51+
52+
bool cp_async4_pred_test() {
53+
const int N = 256;
54+
size_t size = N * sizeof(int4);
55+
56+
// Allocate host memory
57+
int4 *h_in = (int4 *)malloc(size);
58+
int4 *h_out = (int4 *)malloc(size);
59+
60+
for (int i = 0; i < N; i++) {
61+
h_in[i] = make_int4(i, i + 1, i + 2, i + 3);
62+
}
63+
64+
int4 *d_in, *d_out;
65+
cudaMalloc(&d_in, size);
66+
cudaMalloc(&d_out, size);
67+
68+
// Copy input data to device
69+
cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
70+
71+
// Launch kernel
72+
test_cp_async4_pred<<<1, N, size>>>(d_out, d_in);
73+
cudaDeviceSynchronize();
74+
75+
// Copy output data back to host
76+
cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
77+
78+
bool passed = true;
79+
for (int i = 0; i < N; i++) {
80+
if (i % 2 && (h_out[i].x != h_in[i].x || h_out[i].y != h_in[i].y ||
81+
h_out[i].z != h_in[i].z || h_out[i].w != h_in[i].w)) {
82+
83+
passed = false;
84+
std::cout << "Mismatch at index " << i << "\n";
85+
break;
86+
}
87+
}
88+
89+
free(h_in);
90+
free(h_out);
91+
cudaFree(d_in);
92+
cudaFree(d_out);
93+
return passed;
94+
}
95+
96+
__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
97+
const int BYTES = 16;
98+
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
99+
asm volatile("{\n"
100+
" cp.async.cg.shared.global [%0], [%1], %2;\n"
101+
"}\n" ::"r"(smem),
102+
"l"(glob_ptr), "n"(BYTES));
103+
}
104+
105+
__global__ void test_cp_async4(int4 *d_out, int4 *d_in) {
106+
extern __shared__ int4 smem[];
107+
int tid = threadIdx.x;
108+
109+
// Perform async copy
110+
cp_async4(&smem[tid], &d_in[tid]);
111+
112+
// Ensure all async copies are completed before reading
113+
asm volatile("cp.async.commit_group;" ::: "memory");
114+
asm volatile("cp.async.wait_all;" ::: "memory");
115+
__syncthreads();
116+
117+
// Store the result back to global memory for verification
118+
d_out[tid] = smem[tid];
119+
}
120+
121+
bool cp_async4_test() {
122+
const int N = 256;
123+
size_t size = N * sizeof(int4);
124+
125+
// Allocate host memory
126+
int4 *h_in = (int4 *)malloc(size);
127+
int4 *h_out = (int4 *)malloc(size);
128+
129+
for (int i = 0; i < N; i++) {
130+
h_in[i] = make_int4(i, i + 1, i + 2, i + 3);
131+
}
132+
133+
int4 *d_in, *d_out;
134+
cudaMalloc(&d_in, size);
135+
cudaMalloc(&d_out, size);
136+
137+
// Copy input data to device
138+
cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice);
139+
140+
// Launch kernel
141+
test_cp_async4<<<1, N, size>>>(d_out, d_in);
142+
cudaDeviceSynchronize();
143+
144+
// Copy output data back to host
145+
cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
146+
147+
bool passed = true;
148+
for (int i = 0; i < N; i++) {
149+
if (h_out[i].x != h_in[i].x || h_out[i].y != h_in[i].y ||
150+
h_out[i].z != h_in[i].z || h_out[i].w != h_in[i].w) {
151+
passed = false;
152+
std::cout << "Mismatch at index " << i << "\n";
153+
break;
154+
}
155+
}
156+
157+
free(h_in);
158+
free(h_out);
159+
cudaFree(d_in);
160+
cudaFree(d_out);
161+
return passed;
162+
}
163+
164+
int main() {
165+
TEST(cp_async4_pred_test);
166+
TEST(cp_async4_test);
167+
return 0;
168+
}

features/features.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
<test testName="asm_membar_fence" configFile="config/TEMPLATE_asm_excluding_syclcompat.xml" />
1919
<test testName="asm_arith" configFile="config/TEMPLATE_asm.xml" />
2020
<test testName="asm_optimize" configFile="config/TEMPLATE_asm.xml" />
21+
<test testName="asm_cp" configFile="config/TEMPLATE_asm.xml" />
2122
<test testName="thrust-op" configFile="config/TEMPLATE_thrust_op.xml" />
2223
<test testName="thrust-for-h2o4gpu" configFile="config/TEMPLATE_thrust_api.xml" />
2324
<test testName="thrust-vector" configFile="config/TEMPLATE_thrust_api.xml" />

features/test_feature.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363
'thrust_device_new_delete', 'thrust_temporary_buffer', 'thrust_malloc_free', 'codepin', 'thrust_unique_count',
6464
'thrust_advance_trans_op_itr', 'cuda_stream_query', "matmul", "matmul_2", "matmul_3", "transform", "context_push_n_pop",
6565
"graphics_interop_d3d11", 'graph', 'asm_shfl', 'asm_shfl_sync', 'asm_shfl_sync_with_exp', 'asm_membar_fence',
66-
'cub_block_store', 'asm_red']
66+
'cub_block_store', 'asm_red', 'asm_cp']
6767

6868
occupancy_calculation_exper = ['occupancy_calculation']
6969

0 commit comments

Comments
 (0)