Update names

jhuber6 · jhuber6 · commit 7ebfdf430618 · 2024-11-07T10:12:19.000-06:00
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
@@ -26,14 +26,14 @@
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 // Type aliases to the address spaces used by the AMDGPU backend.
-#define __private __attribute__((opencl_private))
-#define __constant __attribute__((opencl_constant))
-#define __local __attribute__((opencl_local))
-#define __global __attribute__((opencl_global))
-#define __generic __attribute__((opencl_generic))
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
 
 // Attribute to declare a function as a kernel.
-#define __kernel __attribute__((amdgpu_kernel, visibility("protected")))
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
@@ -113,7 +113,7 @@ _DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
 
 // Copies the value from the first active thread in the wavefront to the rest.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+__gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
   return __builtin_amdgcn_readfirstlane(__x);
 }
 
@@ -139,7 +139,7 @@ __gpu_sync_lane(uint64_t __lane_mask) {
 
-// Shuffles the the lanes inside the wavefront according to the given index.
+// Shuffles the lanes inside the wavefront according to the given index.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-__gpu_shuffle_idx(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
 }
 
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
@@ -5,6 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures, but
+// some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef __GPUINTRIN_H
 #define __GPUINTRIN_H
@@ -13,6 +21,8 @@
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
 #include <amdgpuintrin.h>
+#else
+#error "This header is only meant to be used on GPU architectures."
 #endif
 
 // Returns the total number of blocks / workgroups.
@@ -51,22 +61,22 @@ _DEFAULT_ATTRS static inline bool __gpu_is_first_lane(uint64_t __lane_mask) {
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
-_DEFAULT_ATTRS static inline uint32_t __gpu_lane_reduce(uint64_t __lane_mask,
-                                                        uint32_t x) {
+_DEFAULT_ATTRS static inline uint32_t
+__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
   for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
     uint32_t index = step + __gpu_lane_id();
-    x += __gpu_shuffle_idx(__lane_mask, index, x);
+    x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
   }
-  return __gpu_broadcast(__lane_mask, x);
+  return __gpu_broadcast_u32(__lane_mask, x);
 }
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
-_DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan(uint64_t __lane_mask,
-                                                      uint32_t x) {
+_DEFAULT_ATTRS static inline uint32_t __gpu_lane_scan_u32(uint64_t __lane_mask,
+                                                          uint32_t x) {
   for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
     uint32_t index = __gpu_lane_id() - step;
     uint32_t bitmask = __gpu_lane_id() >= step;
-    x += -bitmask & __gpu_shuffle_idx(__lane_mask, index, x);
+    x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
   }
   return x;
 }
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
@@ -26,14 +26,14 @@
 #pragma omp begin declare variant match(device = {arch(nvptx64)})
 
 // Type aliases to the address spaces used by the NVPTX backend.
-#define __private __attribute__((opencl_private))
-#define __constant __attribute__((opencl_constant))
-#define __local __attribute__((opencl_local))
-#define __global __attribute__((opencl_global))
-#define __generic __attribute__((opencl_generic))
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
 
 // Attribute to declare a function as a kernel.
-#define __kernel __attribute__((amdgpu_kernel, visibility("protected")))
+#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_ATTRS static inline uint32_t __gpu_num_blocks_x() {
@@ -112,7 +112,7 @@ _DEFAULT_ATTRS [[clang::convergent]] static inline uint64_t __gpu_lane_mask() {
 
 // Copies the value from the first active thread in the warp to the rest.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-__gpu_broadcast(uint64_t __lane_mask, uint32_t __x) {
+__gpu_broadcast_u32(uint64_t __lane_mask, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   uint32_t __id = __builtin_ffs(__mask) - 1;
   return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
@@ -138,7 +138,7 @@ __gpu_sync_lane(uint64_t __lane_mask) {
 
-// Shuffles the the lanes inside the warp according to the given index.
+// Shuffles the lanes inside the warp according to the given index.
 _DEFAULT_ATTRS [[clang::convergent]] static inline uint32_t
-__gpu_shuffle_idx(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
   uint32_t __mask = (uint32_t)__lane_mask;
   uint32_t __bitmask = (__mask >> __idx) & 1u;
   return -__bitmask &
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
@@ -493,11 +493,11 @@ void foo() {
   __gpu_num_lanes();
   __gpu_lane_id();
   __gpu_lane_mask();
-  __gpu_broadcast(-1, -1);
+  __gpu_broadcast_u32(-1, -1);
   __gpu_ballot(-1, 1);
   __gpu_sync_threads();
   __gpu_sync_lane(-1);
-  __gpu_shuffle_idx(-1, -1, -1);
+  __gpu_shuffle_idx_u32(-1, -1, -1);
   __gpu_first_lane_id(-1);
   __gpu_is_first_lane(-1);
   __gpu_exit();