[SYCLomatic #2054] Add test for asm ld and st instructions (#750)

Yihan Wang · web-flow · commit 4fcf105a0bff · 2024-07-29T15:17:02.000+08:00
Signed-off-by: Wang, Yihan <yihan.wang@intel.com>
diff --git a/features/feature_case/asm/asm_mem.cu b/features/feature_case/asm/asm_mem.cu
@@ -0,0 +1,76 @@
+// ===------- asm_mem.cu ----------------------------------- *- CUDA -* ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+// Writes 111, 222, 333 and 444 through inline PTX `st.global.s32` to the
+// addresses a, a + 4, a + 8 and a + 12 (immediate byte offsets), i.e. to the
+// four consecutive 32-bit slots starting at `a`.
+//
+// `a` is passed with the "l" (64-bit register) constraint and each stored value
+// with the "r" (32-bit register) constraint. The kernel does not read any
+// thread or block index, so every launched thread issues the same four stores.
+// `a` must therefore address at least 16 writable bytes.
+__global__ void st(int *a) {
+  asm volatile("st.global.s32 [%0], %1;" ::"l"(a), "r"(111));
+  asm volatile("st.global.s32 [%0 + 4], %1;" ::"l"(a), "r"(222));
+  asm volatile("st.global.s32 [%0 + 8], %1;" ::"l"(a), "r"(333));
+  asm volatile("st.global.s32 [%0 + 12], %1;" ::"l"(a), "r"(444));
+}
+
+// Host-side check for the `st` kernel: launches it with one thread on a
+// four-int device buffer, copies the buffer back and compares it with
+// {111, 222, 333, 444}.
+//
+// Returns true only if every CUDA call succeeded and all four values match.
+// A failed allocation, launch, synchronization or copy yields false rather
+// than a comparison against a host buffer that was never written.
+bool test_store() {
+  const int expected[] = {111, 222, 333, 444};
+  int h_arr[4] = {}; // zero-filled so a failed copy can never look like a pass
+  int *d_arr = nullptr;
+  if (cudaMalloc(&d_arr, sizeof(h_arr)) != cudaSuccess)
+    return false;
+
+  st<<<1, 1>>>(d_arr);
+  // cudaGetLastError reports launch failures; the synchronize call reports
+  // errors raised while the kernel was running.
+  bool ok = cudaGetLastError() == cudaSuccess &&
+            cudaStreamSynchronize(0) == cudaSuccess &&
+            cudaMemcpy(h_arr, d_arr, sizeof(h_arr), cudaMemcpyDeviceToHost) ==
+                cudaSuccess;
+  cudaFree(d_arr);
+
+  for (int i = 0; ok && i < 4; ++i)
+    ok = h_arr[i] == expected[i];
+  return ok;
+}
+
+// Copies four 32-bit values from `arr` to `arr2` using inline PTX only.
+//
+// The four `ld.global.s32` statements read arr, arr + 4, arr + 8 and arr + 12
+// (immediate byte offsets) into the locals a, b, c and d; the four
+// `st.global.s32` statements then write those locals to the same byte offsets
+// from `arr2`. Pointers use the "l" (64-bit register) constraint, values the
+// "r"/"=r" (32-bit register) constraints.
+//
+// No thread or block index is read, so every launched thread performs the same
+// accesses. Both pointers must address at least 16 bytes.
+__global__ void ld(int *arr, int *arr2) {
+  int a, b, c, d;
+  asm volatile("ld.global.s32 %0, [%1];" : "=r"(a) : "l"(arr));
+  asm volatile("ld.global.s32 %0, [%1 + 4];" : "=r"(b) : "l"(arr));
+  asm volatile("ld.global.s32 %0, [%1 + 8];" : "=r"(c) : "l"(arr));
+  asm volatile("ld.global.s32 %0, [%1 + 12];" : "=r"(d) : "l"(arr));
+  asm volatile("st.global.s32 [%0], %1;" ::"l"(arr2), "r"(a));
+  asm volatile("st.global.s32 [%0 + 4], %1;" ::"l"(arr2), "r"(b));
+  asm volatile("st.global.s32 [%0 + 8], %1;" ::"l"(arr2), "r"(c));
+  asm volatile("st.global.s32 [%0 + 12], %1;" ::"l"(arr2), "r"(d));
+}
+
+// Host-side check for the `ld` kernel: uploads {111, 222, 333, 444} to one
+// device buffer, launches `ld` with one thread to copy it into a second device
+// buffer, downloads the second buffer and compares it with the input.
+//
+// Returns true only if every CUDA call succeeded and all four values match.
+// Both device buffers are released on every path, including failures.
+bool test_load() {
+  const int expected[] = {111, 222, 333, 444};
+  int h_arr[4] = {}; // zero-filled so a failed copy can never look like a pass
+  int *d_arr = nullptr, *d_arr2 = nullptr;
+
+  bool ok = cudaMalloc(&d_arr, sizeof(expected)) == cudaSuccess &&
+            cudaMalloc(&d_arr2, sizeof(h_arr)) == cudaSuccess &&
+            cudaMemcpy(d_arr, expected, sizeof(expected),
+                       cudaMemcpyHostToDevice) == cudaSuccess;
+  if (ok) {
+    ld<<<1, 1>>>(d_arr, d_arr2);
+    // cudaGetLastError reports launch failures; the synchronize call reports
+    // errors raised while the kernel was running.
+    ok = cudaGetLastError() == cudaSuccess &&
+         cudaStreamSynchronize(0) == cudaSuccess &&
+         cudaMemcpy(h_arr, d_arr2, sizeof(h_arr), cudaMemcpyDeviceToHost) ==
+             cudaSuccess;
+  }
+
+  // Either pointer may still be null if its allocation failed or was skipped.
+  if (d_arr)
+    cudaFree(d_arr);
+  if (d_arr2)
+    cudaFree(d_arr2);
+
+  for (int i = 0; ok && i < 4; ++i)
+    ok = h_arr[i] == expected[i];
+  return ok;
+}
+
+// Calls the zero-argument predicate FN, prints "Test <FN> PASS" or
+// "Test <FN> FAIL", and on failure returns 1 from the enclosing function.
+// The body is wrapped in do { ... } while (0) so that `TEST(fn);` expands to a
+// single statement and stays well-formed inside an unbraced if/else.
+#define TEST(FN)                                                               \
+  do {                                                                         \
+    if (FN()) {                                                                \
+      printf("Test " #FN " PASS\n");                                           \
+    } else {                                                                   \
+      printf("Test " #FN " FAIL\n");                                           \
+      return 1;                                                                \
+    }                                                                          \
+  } while (0)
+
+// Runs the store test and then the load test. TEST prints a PASS/FAIL line for
+// each and returns 1 from main at the first failure, so the load test is
+// skipped when the store test fails. Returns 0 when both pass.
+int main() {
+  TEST(test_store);
+  TEST(test_load);
+
+  return 0;
+}
diff --git a/features/features.xml b/features/features.xml
@@ -6,6 +6,7 @@
   <tests>
     <test testName="asm" configFile="config/TEMPLATE_asm.xml" />
     <test testName="asm_bar" configFile="config/TEMPLATE_asm.xml" />
+    <test testName="asm_mem" configFile="config/TEMPLATE_asm.xml" />
     <test testName="asm_vinst" configFile="config/TEMPLATE_asm.xml" />
     <test testName="asm_v2inst" configFile="config/TEMPLATE_asm.xml" />
     <test testName="asm_v4inst" configFile="config/TEMPLATE_asm.xml" />
diff --git a/features/test_feature.py b/features/test_feature.py
@@ -18,7 +18,7 @@
 
 from test_utils import *
 
-exec_tests = ['asm', 'asm_bar', 'asm_arith', 'asm_vinst', 'asm_v2inst', 'asm_v4inst', 'asm_optimize', 'thrust-vector-2', 'thrust-binary-search', 'thrust-count', 'thrust-copy',
+exec_tests = ['asm', 'asm_bar', 'asm_mem', 'asm_arith', 'asm_vinst', 'asm_v2inst', 'asm_v4inst', 'asm_optimize', 'thrust-vector-2', 'thrust-binary-search', 'thrust-count', 'thrust-copy',
               'thrust-qmc', 'thrust-transform-if', 'thrust-policy', 'thrust-list', 'module-kernel',
               'kernel-launch', 'thrust-gather', 'thrust-gather_if', 'cub_device_partition',
               'thrust-scatter', 'thrust-unique_by_key_copy', 'thrust-for-hypre', 'thrust-merge_by_key',