|
14 | 14 | // (3) Compound statements.
|
15 | 15 | // (4) Conditional instructions.
|
16 | 16 | // (5) Instructions(mov, setp, and lop3).
|
| 17 | +// (6) Builtin registers. |
17 | 18 | //
|
18 | 19 | // Usually, we check the result of inline asm statement to ensure that the
|
19 | 20 | // migrated programe has the same behavior with the inline asmstatement.
|
|
24 | 25 | #include <cstdint>
|
25 | 26 | #include <cstdio>
|
26 | 27 | #include <cuda_runtime.h>
|
| 28 | +#include <map> |
27 | 29 |
|
28 | 30 | #define EPS (1e-6)
|
29 | 31 |
|
@@ -244,6 +246,28 @@ __global__ void declaration(int *ec) {
|
244 | 246 | *ec = 0;
|
245 | 247 | }
|
246 | 248 |
|
| 249 | +__global__ void builtin(int *ec, int *laneids, int *warpszs, int *warpids) { |
| 250 | + unsigned laneid, warp_size, warpid; |
| 251 | + unsigned tid = |
| 252 | + ((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y)) + |
| 253 | + (threadIdx.x + (threadIdx.y * blockDim.x)); |
| 254 | + asm volatile("mov.u32 %0, %%laneid;" : "=r"(laneid)); |
| 255 | + asm volatile("mov.u32 %0, WARP_SZ;" : "=r"(warp_size)); |
| 256 | + asm volatile("mov.u32 %0, %%warpid;" : "=r"(warpid)); |
| 257 | + unsigned laneid2 = (threadIdx.x & (warpSize - 1)); |
| 258 | + if (laneid != laneid2) { |
| 259 | + *ec = 1; |
| 260 | + return; |
| 261 | + } |
| 262 | + laneids[tid] = laneid; |
| 263 | + warpszs[tid] = warp_size; |
| 264 | + warpids[tid] = warpid; |
| 265 | + if (tid == 0) { |
| 266 | + *ec = 0; |
| 267 | + } |
| 268 | + // printf("laneid=%u\n", laneid); |
| 269 | +} |
| 270 | + |
247 | 271 | __global__ void setp(int *ec) {
|
248 | 272 | int32_t i32;
|
249 | 273 | uint32_t u32;
|
@@ -1410,11 +1434,57 @@ int main() {
|
1410 | 1434 | declaration<<<1, 1>>>(d_ec);
|
1411 | 1435 | wait_and_check("declaration");
|
1412 | 1436 |
|
| 1437 | + int *d_warpids, *d_warpszs, *d_laneids; |
| 1438 | + cudaMalloc(&d_laneids, sizeof(int) * 66); |
| 1439 | + cudaMalloc(&d_warpszs, sizeof(int) * 66); |
| 1440 | + cudaMalloc(&d_warpids, sizeof(int) * 66); |
| 1441 | + builtin<<<2, 33>>>(d_ec, d_laneids, d_warpszs, d_warpids); |
| 1442 | + wait_and_check("builtin"); |
| 1443 | + int laneids[66] = {0}, warpids[66] = {0}, warpszs[66] = {0}; |
| 1444 | + cudaMemcpy(laneids, d_laneids, sizeof(int) * 66, cudaMemcpyDeviceToHost); |
| 1445 | + cudaMemcpy(warpids, d_warpids, sizeof(int) * 66, cudaMemcpyDeviceToHost); |
| 1446 | + cudaMemcpy(warpszs, d_warpszs, sizeof(int) * 66, cudaMemcpyDeviceToHost); |
| 1447 | + std::map<int, int> cnt_laneid, cnt_warpid, cnt_warpsz, cnt_laneid_num; |
| 1448 | + for (int I = 0; I < 66; ++I) { |
| 1449 | + cnt_warpid[warpids[I]]++; |
| 1450 | + cnt_warpsz[warpszs[I]]++; |
| 1451 | + cnt_laneid[laneids[I]]++; |
| 1452 | + } |
| 1453 | + |
| 1454 | + int total_warpid = 0; |
| 1455 | + for (const auto &[k, v] : cnt_warpid) |
| 1456 | + total_warpid += v; |
| 1457 | + for (const auto &[k, v] : cnt_laneid) |
| 1458 | + cnt_laneid_num[v]++; |
| 1459 | + |
| 1460 | + auto check_laneid_num = [&]() { |
| 1461 | + if (cnt_laneid_num.size() != 2) |
| 1462 | + return false; |
| 1463 | + const auto first = *cnt_laneid_num.begin(); |
| 1464 | + const auto second = *std::next(cnt_laneid_num.begin()); |
| 1465 | + return first.first + 2 == second.first; |
| 1466 | + }; |
| 1467 | + |
| 1468 | + cudaMemset(d_ec, !check_laneid_num(), sizeof(int)); |
| 1469 | + wait_and_check("builtin"); |
| 1470 | + |
| 1471 | + cudaMemset(d_ec, total_warpid != 66, sizeof(int)); |
| 1472 | + wait_and_check("builtin"); |
| 1473 | + |
| 1474 | + cudaMemset(d_ec, cnt_warpsz.size() > 2U, sizeof(int)); |
| 1475 | + wait_and_check("builtin"); |
| 1476 | + cudaMemset(d_ec, 0, sizeof(int)); |
| 1477 | + cudaFree(d_warpids); |
| 1478 | + cudaFree(d_laneids); |
| 1479 | + cudaFree(d_warpszs); |
| 1480 | + |
1413 | 1481 | setp<<<1, 1>>>(d_ec);
|
1414 | 1482 | wait_and_check("setp");
|
1415 | 1483 |
|
1416 | 1484 | lop3<<<1, 1>>>(d_ec);
|
1417 | 1485 | wait_and_check("lop3");
|
1418 | 1486 |
|
| 1487 | + cudaFree(d_ec); |
| 1488 | + |
1419 | 1489 | return ret;
|
1420 | 1490 | }
|
0 commit comments