diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index d8109537832e9..d6f8c499c8904 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -14,117 +14,53 @@
 #include "Debug.h"
 #include "Interface.h"
 #include "Mapping.h"
+#include "gpuintrin.h"
 
 using namespace ompx;
 
-namespace impl {
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
-  static_assert(sizeof(unsigned long) == 8, "");
-  *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
-  *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
-  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
-}
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
-                    int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
-  int Self = mapping::getThreadIdInWarp();
-  int Index = SrcLane + (Self & ~(Width - 1));
-  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
-                    int32_t Width) {
-  int Self = mapping::getThreadIdInWarp();
-  int Index = Self + LaneDelta;
-  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
-  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
-  return Mask & __builtin_amdgcn_ballot_w64(Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) {
-  return __builtin_amdgcn_is_shared(
-      (const __attribute__((address_space(0))) void *)Ptr);
-}
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
-  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
-  int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
-  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
-  return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
-
-#endif
-///}
-} // namespace impl
-
 uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
-  return impl::Pack(LowBits, HighBits);
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
 }
 
 void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
-  impl::Unpack(Val, &LowBits, &HighBits);
+  static_assert(sizeof(unsigned long) == 8, "");
+  LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
+  HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
 }
 
 int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
                        int32_t Width) {
-  return impl::shuffle(Mask, Var, SrcLane, Width);
+  return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width);
 }
 
 int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
                            int32_t Width) {
-  return impl::shuffleDown(Mask, Var, Delta, Width);
+  // A lane whose source, Self + Delta, would fall outside its Width-sized
+  // group reads from itself instead (presumably Width is a power of two).
+  int32_t Self = mapping::getThreadIdInWarp();
+  int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+  // Var is 32 bits wide, so use the 32-bit shuffle as utils::shuffle does.
+  return __gpu_shuffle_idx_u32(Mask, Index, Var, Width);
 }
 
 int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
                            int32_t Width) {
-  uint32_t Lo, Hi;
-  utils::unpack(Var, Lo, Hi);
-  Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
-  Lo = impl::shuffleDown(Mask, Lo, Delta, Width);
-  return utils::pack(Lo, Hi);
+  // A lane whose source, Self + Delta, would fall outside its Width-sized
+  // group reads from itself instead (presumably Width is a power of two).
+  int32_t Self = mapping::getThreadIdInWarp();
+  int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+  return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
 }
 
 uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
-  return impl::ballotSync(Mask, Pred);
+  return __gpu_ballot(Mask, Pred);
 }
 
-bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
+bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
 
 extern "C" {
 int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
-  return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
+  return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
 }
 
 int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index 734e937f03920..a89f8b2a74531 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -20,41 +20,6 @@
 namespace ompx {
 namespace impl {
 
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-double getWTick() {
-  // The number of ticks per second for the AMDGPU clock varies by card and can
-  // only be retrieved by querying the driver. We rely on the device environment
-  // to inform us what the proper frequency is.
-  return 1.0 / config::getClockFrequency();
-}
-
-double getWTime() {
-  return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
-}
-
-#endif
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-double getWTick() {
-  // Timer precision is 1ns
-  return ((double)1E-9);
-}
-
-double getWTime() {
-  uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
-  return static_cast<double>(nsecs) * getWTick();
-}
-
-#endif
-
 /// Lookup a device-side function using a host pointer /p HstPtr using the table
 /// provided by the device plugin. The table is an ordered pair of host and
 /// device pointers sorted on the value of the host pointer.
@@ -112,9 +77,17 @@ int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
 
 int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
 
-double omp_get_wtick(void) { return ompx::impl::getWTick(); }
+double omp_get_wtick(void) {
+  // The number of ticks per second for the AMDGPU clock varies by card and can
+  // only be retrieved by querying the driver. We rely on the device environment
+  // to inform us what the proper frequency is. NVPTX uses a nanosecond
+  // resolution, we could omit the global read but this makes it consistent.
+  return 1.0 / ompx::config::getClockFrequency();
+}
 
-double omp_get_wtime(void) { return ompx::impl::getWTime(); }
+double omp_get_wtime(void) {
+  return static_cast<double>(__builtin_readsteadycounter()) * omp_get_wtick();
+}
 
 void *__llvm_omp_indirect_call_lookup(void *HstPtr) {
   return ompx::impl::indirectCallLookup(HstPtr);