@@ -148,34 +148,35 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_REDUCE(__type, __suffix)                                     \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_reduce_##__suffix(     \
-      uint64_t __lane_mask, __type x) {                                        \
-    for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {         \
-      uint32_t index = step + __gpu_lane_id();                                 \
-      x += __gpu_shuffle_idx_##__suffix(__lane_mask, index, x);                \
+#define __DO_LANE_SUM(__type, __suffix)                                        \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
+      uint64_t __lane_mask, __type __x) {                                      \
+    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
+      uint32_t __index = __step + __gpu_lane_id();                             \
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);          \
     }                                                                          \
-    return __gpu_read_first_lane_##__suffix(__lane_mask, x);                   \
+    return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
   }
-__DO_LANE_REDUCE(uint32_t, u32); // uint32_t __gpu_lane_reduce_u32(m, x)
-__DO_LANE_REDUCE(uint64_t, u64); // uint64_t __gpu_lane_reduce_u64(m, x)
-__DO_LANE_REDUCE(float, f32);    // float __gpu_lane_reduce_f32(m, x)
-__DO_LANE_REDUCE(double, f64);   // double __gpu_lane_reduce_f64(m, x)
-#undef __DO_LANE_REDUCE
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
 #define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
   _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
-      uint64_t __lane_mask, uint32_t x) {                                      \
-    for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {             \
-      uint32_t index = __gpu_lane_id() - step;                                 \
-      __bitmask_type bitmask = __gpu_lane_id() >= step;                        \
-      x += __builtin_bit_cast(                                                 \
-          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                \
-                                                __gpu_shuffle_idx_##__suffix(  \
-                                                    __lane_mask, index, x)));  \
+      uint64_t __lane_mask, uint32_t __x) {                                    \
+    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
+      uint32_t __index = __gpu_lane_id() - __step;                             \
+      __bitmask_type bitmask = __gpu_lane_id() >= __step;                      \
+      __x += __builtin_bit_cast(                                               \
+          __type,                                                              \
+          -bitmask & __builtin_bit_cast(__bitmask_type,                        \
+                                        __gpu_shuffle_idx_##__suffix(          \
+                                            __lane_mask, __index, __x)));      \
     }                                                                          \
-    return x;                                                                  \
+    return __x;                                                                \
   }
 __DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
 __DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
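
For context on the renamed helper: __gpu_lane_sum_* repeatedly adds the value held by the lane __step positions above, then broadcasts the first lane's result, so every participating lane receives the warp/wavefront-wide total. A minimal usage sketch, assuming device compilation against this header (the wrapper name is hypothetical, and __gpu_lane_mask() is assumed to be defined elsewhere in gpuintrin.h):

#include <gpuintrin.h>
#include <stdint.h>

// Hypothetical wrapper: each lane passes in its own count and every lane in
// the current warp/wavefront gets back the combined total.
static inline uint32_t sum_counts_across_lanes(uint32_t my_count) {
  uint64_t mask = __gpu_lane_mask(); // assumed: mask of currently active lanes
  return __gpu_lane_sum_u32(mask, my_count);
}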
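In the scan macro, __gpu_lane_id() - __step wraps around for the first __step lanes, which have no partner lane to read from, so the shuffled value must be discarded rather than added. The -bitmask & ... expression does this without a divergent branch: negating a 0/1 flag yields an all-ones or all-zeros mask. A scalar sketch of that masking idea (the helper name is illustrative, not part of the header):

#include <stdint.h>

// -flag is 0xFFFFFFFF when flag == 1 and 0 when flag == 0, so the AND either
// keeps or zeroes the incoming value with no branch, mirroring the
// -bitmask & __builtin_bit_cast(...) pattern in __DO_LANE_SCAN.
static inline uint32_t masked_add_u32(uint32_t acc, uint32_t incoming,
                                      uint32_t lane_id, uint32_t step) {
  uint32_t flag = lane_id >= step; // 1 only if a valid partner lane exists
  return acc + (-flag & incoming); // add the partner's value, or add nothing
}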