Skip to content

Commit 437558f

Browse files
author
Yihan Wang
authored
[SYCLomatic #1010] Add test for %laneid, %warpid and WARP_SZ (#368)
Signed-off-by: Wang, Yihan <[email protected]>
1 parent 20b49f7 commit 437558f

File tree

1 file changed

+70
-0
lines changed
  • features/feature_case/asm

1 file changed

+70
-0
lines changed

features/feature_case/asm/asm.cu

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
// (3) Compound statements.
1515
// (4) Conditional instructions.
1616
// (5) Instructions(mov, setp, and lop3).
17+
// (6) Builtin registers.
1718
//
1819
// Usually, we check the result of inline asm statement to ensure that the
1920
// migrated programe has the same behavior with the inline asmstatement.
@@ -24,6 +25,7 @@
2425
#include <cstdint>
2526
#include <cstdio>
2627
#include <cuda_runtime.h>
28+
#include <map>
2729

2830
#define EPS (1e-6)
2931

@@ -244,6 +246,28 @@ __global__ void declaration(int *ec) {
244246
*ec = 0;
245247
}
246248

249+
__global__ void builtin(int *ec, int *laneids, int *warpszs, int *warpids) {
250+
unsigned laneid, warp_size, warpid;
251+
unsigned tid =
252+
((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y)) +
253+
(threadIdx.x + (threadIdx.y * blockDim.x));
254+
asm volatile("mov.u32 %0, %%laneid;" : "=r"(laneid));
255+
asm volatile("mov.u32 %0, WARP_SZ;" : "=r"(warp_size));
256+
asm volatile("mov.u32 %0, %%warpid;" : "=r"(warpid));
257+
unsigned laneid2 = (threadIdx.x & (warpSize - 1));
258+
if (laneid != laneid2) {
259+
*ec = 1;
260+
return;
261+
}
262+
laneids[tid] = laneid;
263+
warpszs[tid] = warp_size;
264+
warpids[tid] = warpid;
265+
if (tid == 0) {
266+
*ec = 0;
267+
}
268+
// printf("laneid=%u\n", laneid);
269+
}
270+
247271
__global__ void setp(int *ec) {
248272
int32_t i32;
249273
uint32_t u32;
@@ -1410,11 +1434,57 @@ int main() {
14101434
declaration<<<1, 1>>>(d_ec);
14111435
wait_and_check("declaration");
14121436

1437+
int *d_warpids, *d_warpszs, *d_laneids;
1438+
cudaMalloc(&d_laneids, sizeof(int) * 66);
1439+
cudaMalloc(&d_warpszs, sizeof(int) * 66);
1440+
cudaMalloc(&d_warpids, sizeof(int) * 66);
1441+
builtin<<<2, 33>>>(d_ec, d_laneids, d_warpszs, d_warpids);
1442+
wait_and_check("builtin");
1443+
int laneids[66] = {0}, warpids[66] = {0}, warpszs[66] = {0};
1444+
cudaMemcpy(laneids, d_laneids, sizeof(int) * 66, cudaMemcpyDeviceToHost);
1445+
cudaMemcpy(warpids, d_warpids, sizeof(int) * 66, cudaMemcpyDeviceToHost);
1446+
cudaMemcpy(warpszs, d_warpszs, sizeof(int) * 66, cudaMemcpyDeviceToHost);
1447+
std::map<int, int> cnt_laneid, cnt_warpid, cnt_warpsz, cnt_laneid_num;
1448+
for (int I = 0; I < 66; ++I) {
1449+
cnt_warpid[warpids[I]]++;
1450+
cnt_warpsz[warpszs[I]]++;
1451+
cnt_laneid[laneids[I]]++;
1452+
}
1453+
1454+
int total_warpid = 0;
1455+
for (const auto &[k, v] : cnt_warpid)
1456+
total_warpid += v;
1457+
for (const auto &[k, v] : cnt_laneid)
1458+
cnt_laneid_num[v]++;
1459+
1460+
auto check_laneid_num = [&]() {
1461+
if (cnt_laneid_num.size() != 2)
1462+
return false;
1463+
const auto first = *cnt_laneid_num.begin();
1464+
const auto second = *std::next(cnt_laneid_num.begin());
1465+
return first.first + 2 == second.first;
1466+
};
1467+
1468+
cudaMemset(d_ec, !check_laneid_num(), sizeof(int));
1469+
wait_and_check("builtin");
1470+
1471+
cudaMemset(d_ec, total_warpid != 66, sizeof(int));
1472+
wait_and_check("builtin");
1473+
1474+
cudaMemset(d_ec, cnt_warpsz.size() > 2U, sizeof(int));
1475+
wait_and_check("builtin");
1476+
cudaMemset(d_ec, 0, sizeof(int));
1477+
cudaFree(d_warpids);
1478+
cudaFree(d_laneids);
1479+
cudaFree(d_warpszs);
1480+
14131481
setp<<<1, 1>>>(d_ec);
14141482
wait_and_check("setp");
14151483

14161484
lop3<<<1, 1>>>(d_ec);
14171485
wait_and_check("lop3");
14181486

1487+
cudaFree(d_ec);
1488+
14191489
return ret;
14201490
}

0 commit comments

Comments
 (0)