[OpenMP] Replace utilities with 'gpuintrin.h' definitions (#131644)

jhuber6 · web-flow · commit cb493d2bab40 · 2025-03-19T10:47:21.000-05:00
Summary: Port more instructions. AMD version is at https://gist.github.com/jhuber6/235d7ee95f747c75f9a3cfd8eedac6aa
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -14,117 +14,48 @@
 #include "Debug.h"
 #include "Interface.h"
 #include "Mapping.h"
+#include "gpuintrin.h"
 
 using namespace ompx;
 
-namespace impl {
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
-  static_assert(sizeof(unsigned long) == 8, "");
-  *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
-  *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
-  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
-}
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
-                    int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
-  int Self = mapping::getThreadIdInWarp();
-  int Index = SrcLane + (Self & ~(Width - 1));
-  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
-                    int32_t Width) {
-  int Self = mapping::getThreadIdInWarp();
-  int Index = Self + LaneDelta;
-  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
-  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
-  return Mask & __builtin_amdgcn_ballot_w64(Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) {
-  return __builtin_amdgcn_is_shared(
-      (const __attribute__((address_space(0))) void *)Ptr);
-}
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
-  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
-  int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
-  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
-  return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
-
-#endif
-///}
-} // namespace impl
-
 uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
-  return impl::Pack(LowBits, HighBits);
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
 }
 
 void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
-  impl::Unpack(Val, &LowBits, &HighBits);
+  static_assert(sizeof(unsigned long) == 8, "");
+  LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
+  HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
 }
 
 int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
                        int32_t Width) {
-  return impl::shuffle(Mask, Var, SrcLane, Width);
+  return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
 }
 
 int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
                            int32_t Width) {
-  return impl::shuffleDown(Mask, Var, Delta, Width);
+  int32_t Self = mapping::getThreadIdInWarp();
+  int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+  return __gpu_shuffle_idx_u32(Mask, Index, Var, Width);
 }
 
 int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
                            int32_t Width) {
-  uint32_t Lo, Hi;
-  utils::unpack(Var, Lo, Hi);
-  Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
-  Lo = impl::shuffleDown(Mask, Lo, Delta, Width);
-  return utils::pack(Lo, Hi);
+  int32_t Self = mapping::getThreadIdInWarp();
+  int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+  return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
 }
 
 uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
-  return impl::ballotSync(Mask, Pred);
+  return __gpu_ballot(Mask, Pred);
 }
 
-bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
+bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
 
 extern "C" {
 int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
-  return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
+  return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
 }
 
 int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
@@ -20,41 +20,6 @@
 namespace ompx {
 namespace impl {
 
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-double getWTick() {
-  // The number of ticks per second for the AMDGPU clock varies by card and can
-  // only be retrieved by querying the driver. We rely on the device environment
-  // to inform us what the proper frequency is.
-  return 1.0 / config::getClockFrequency();
-}
-
-double getWTime() {
-  return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
-}
-
-#endif
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-double getWTick() {
-  // Timer precision is 1ns
-  return ((double)1E-9);
-}
-
-double getWTime() {
-  uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
-  return static_cast<double>(nsecs) * getWTick();
-}
-
-#endif
-
 /// Lookup a device-side function using a host pointer /p HstPtr using the table
 /// provided by the device plugin. The table is an ordered pair of host and
 /// device pointers sorted on the value of the host pointer.
@@ -112,9 +77,17 @@ int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
 
 int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
 
-double omp_get_wtick(void) { return ompx::impl::getWTick(); }
+double omp_get_wtick(void) {
+  // The number of ticks per second for the AMDGPU clock varies by card and can
+  // only be retrieved by querying the driver. We rely on the device environment
+  // to inform us what the proper frequency is. NVPTX uses a nanosecond
+  // resolution; we could omit the global read, but this makes it consistent.
+  return 1.0 / ompx::config::getClockFrequency();
+}
 
-double omp_get_wtime(void) { return ompx::impl::getWTime(); }
+double omp_get_wtime(void) {
+  return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick();
+}
 
 void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
   return ompx::impl::indirectCallLookup(HstPtr);