[SYCLomatic #1010] Add test for %laneid, %warpid and WARP_SZ (#368)

Yihan Wang · web-flow · commit 437558feb446 · 2023-06-28T10:09:39.000+08:00
Signed-off-by: Wang, Yihan <yihan.wang@intel.com>
diff --git a/features/feature_case/asm/asm.cu b/features/feature_case/asm/asm.cu
@@ -14,6 +14,7 @@
 // (3) Compound statements.
 // (4) Conditional instructions.
 // (5) Instructions(mov, setp, and lop3).
+// (6) Builtin registers.
 //
 // Usually, we check the result of inline asm statement to ensure that the
 // migrated programe has the same behavior with the inline asmstatement.
@@ -24,6 +25,7 @@
 #include <cstdint>
 #include <cstdio>
 #include <cuda_runtime.h>
+#include <map>
 
 #define EPS (1e-6)
 
@@ -244,6 +246,28 @@ __global__ void declaration(int *ec) {
   *ec = 0;
 }
 
+// Writes each thread's %laneid, WARP_SZ and %warpid at its flat (x, y) index.
+// Mismatching lanes set *ec = 1, thread 0 sets 0: unordered, so a failure can
+// be overwritten (NOTE(review): needs host-side zeroing of *ec to be fixed).
+__global__ void builtin(int *ec, int *laneids, int *warpszs, int *warpids) {
+  unsigned laneid, warp_size, warpid;
+  // Linear (x, y) index in the block (z ignored, as in tid); lanes follow it.
+  unsigned local = threadIdx.x + (threadIdx.y * blockDim.x);
+  unsigned tid =
+      (blockIdx.x + blockIdx.y * gridDim.x) * (blockDim.x * blockDim.y) + local;
+  asm volatile("mov.u32 %0, %%laneid;" : "=r"(laneid));
+  asm volatile("mov.u32 %0, WARP_SZ;" : "=r"(warp_size));
+  asm volatile("mov.u32 %0, %%warpid;" : "=r"(warpid));
+  // Store before validating so the host never copies back unwritten memory.
+  laneids[tid] = laneid;
+  warpszs[tid] = warp_size;
+  warpids[tid] = warpid;
+  if (laneid != (local & (warpSize - 1)))
+    *ec = 1;
+  else if (tid == 0)
+    *ec = 0;
+}
+
 __global__ void setp(int *ec) {
   int32_t i32;
   uint32_t u32;
@@ -1410,11 +1434,57 @@ int main() {
   declaration<<<1, 1>>>(d_ec);
   wait_and_check("declaration");
 
+  int *d_warpids, *d_warpszs, *d_laneids;
+  cudaMalloc(&d_laneids, sizeof(int) * 66);
+  cudaMalloc(&d_warpszs, sizeof(int) * 66);
+  cudaMalloc(&d_warpids, sizeof(int) * 66);
+  builtin<<<2, 33>>>(d_ec, d_laneids, d_warpszs, d_warpids);
+  wait_and_check("builtin");
+  int laneids[66] = {0}, warpids[66] = {0}, warpszs[66] = {0};
+  cudaMemcpy(laneids, d_laneids, sizeof(int) * 66, cudaMemcpyDeviceToHost);
+  cudaMemcpy(warpids, d_warpids, sizeof(int) * 66, cudaMemcpyDeviceToHost);
+  cudaMemcpy(warpszs, d_warpszs, sizeof(int) * 66, cudaMemcpyDeviceToHost);
+  std::map<int, int> cnt_laneid, cnt_warpid, cnt_warpsz, cnt_laneid_num;
+  for (int I = 0; I < 66; ++I) {
+    cnt_warpid[warpids[I]]++;
+    cnt_warpsz[warpszs[I]]++;
+    cnt_laneid[laneids[I]]++;
+  }
+
+  int total_warpid = 0;
+  for (const auto &[k, v] : cnt_warpid)
+    total_warpid += v;
+  for (const auto &[k, v] : cnt_laneid)
+    cnt_laneid_num[v]++;
+
+  auto check_laneid_num = [&]() {
+    if (cnt_laneid_num.size() != 2)
+      return false;
+    const auto first = *cnt_laneid_num.begin();
+    const auto second = *std::next(cnt_laneid_num.begin());
+    return first.first + 2 == second.first;
+  };
+
+  cudaMemset(d_ec, !check_laneid_num(), sizeof(int));
+  wait_and_check("builtin");
+
+  cudaMemset(d_ec, total_warpid != 66, sizeof(int));
+  wait_and_check("builtin");
+
+  cudaMemset(d_ec, cnt_warpsz.size() > 2U, sizeof(int));
+  wait_and_check("builtin");
+  cudaMemset(d_ec, 0, sizeof(int));
+  cudaFree(d_warpids);
+  cudaFree(d_laneids);
+  cudaFree(d_warpszs);
+
   setp<<<1, 1>>>(d_ec);
   wait_and_check("setp");
 
   lop3<<<1, 1>>>(d_ec);
   wait_and_check("lop3");
 
+  cudaFree(d_ec);
+
   return ret;
 }