From 71d68ed01184b833a6f614251177f0ea1da3ad56 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 17 Mar 2025 10:43:40 -0500 Subject: [PATCH 1/2] [OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers Summary: This patch ports the runtime to use `gpuintrin.h` instead of calling the builtins for most things. The `lanemask_gt` stuff was left for now with a fallback. --- offload/DeviceRTL/include/Mapping.h | 6 +- offload/DeviceRTL/src/Mapping.cpp | 255 +++++----------------------- 2 files changed, 49 insertions(+), 212 deletions(-) diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index f892a025159d4..8ba018b5314aa 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -19,9 +19,9 @@ namespace ompx { namespace mapping { enum { - DIM_X = 0, - DIM_Y = 1, - DIM_Z = 2, + DIM_X = __GPU_X_DIM, + DIM_Y = __GPU_Y_DIM, + DIM_Z = __GPU_Z_DIM, }; inline constexpr uint32_t MaxThreadsPerTeam = 1024; diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index 641be81cca3ed..a9e027727b04b 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ b/offload/DeviceRTL/src/Mapping.cpp @@ -14,198 +14,12 @@ #include "DeviceUtils.h" #include "Interface.h" #include "State.h" +#include "gpuintrin.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" using namespace ompx; -namespace ompx { -namespace impl { - -/// AMDGCN Implementation -/// -///{ -#ifdef __AMDGPU__ - -uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); } - -uint32_t getNumberOfThreadsInBlock(int32_t Dim) { - switch (Dim) { - case 0: - return __builtin_amdgcn_workgroup_size_x(); - case 1: - return __builtin_amdgcn_workgroup_size_y(); - case 2: - return __builtin_amdgcn_workgroup_size_z(); - }; - UNREACHABLE("Dim outside range!"); -} - -LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } - -LaneMaskTy lanemaskLT() { - uint32_t Lane = mapping::getThreadIdInWarp(); - int64_t Ballot = mapping::activemask(); - uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; - return Mask & Ballot; -} - -LaneMaskTy lanemaskGT() { - uint32_t Lane = mapping::getThreadIdInWarp(); - if (Lane == (mapping::getWarpSize() - 1)) - return 0; - int64_t Ballot = mapping::activemask(); - uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); - return Mask & Ballot; -} - -uint32_t getThreadIdInWarp() { - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); -} - -uint32_t getThreadIdInBlock(int32_t Dim) { - switch (Dim) { - case 0: - return __builtin_amdgcn_workitem_id_x(); - case 1: - return __builtin_amdgcn_workitem_id_y(); - case 2: - return __builtin_amdgcn_workitem_id_z(); - }; - UNREACHABLE("Dim outside range!"); -} - -uint32_t getNumberOfThreadsInKernel() { - return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() * - __builtin_amdgcn_grid_size_z(); -} - -uint32_t getBlockIdInKernel(int32_t Dim) { - switch (Dim) { - case 0: - return __builtin_amdgcn_workgroup_id_x(); - case 1: - return __builtin_amdgcn_workgroup_id_y(); - case 2: - return __builtin_amdgcn_workgroup_id_z(); - }; - UNREACHABLE("Dim outside range!"); -} - -uint32_t getNumberOfBlocksInKernel(int32_t Dim) { - switch (Dim) { - case 0: - return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); - case 1: - return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); - case 2: - return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); - }; - UNREACHABLE("Dim outside range!"); -} - -uint32_t getWarpIdInBlock() { - return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); -} - -uint32_t getNumberOfWarpsInBlock() { - return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize(); -} - -#endif -///} - -/// NVPTX Implementation -/// -///{ -#ifdef __NVPTX__ - -uint32_t getNumberOfThreadsInBlock(int32_t Dim) { - switch (Dim) { - case 0: - return __nvvm_read_ptx_sreg_ntid_x(); - case 1: - return __nvvm_read_ptx_sreg_ntid_y(); - case 2: - return __nvvm_read_ptx_sreg_ntid_z(); - }; - UNREACHABLE("Dim outside range!"); -} - -uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); } - -LaneMaskTy activemask() { return __nvvm_activemask(); } - -LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); } - -LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); } - -uint32_t getThreadIdInBlock(int32_t Dim) { - switch (Dim) { - case 0: - return __nvvm_read_ptx_sreg_tid_x(); - case 1: - return __nvvm_read_ptx_sreg_tid_y(); - case 2: - return __nvvm_read_ptx_sreg_tid_z(); - }; - UNREACHABLE("Dim outside range!"); -} - -uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); } - -uint32_t getBlockIdInKernel(int32_t Dim) { - switch (Dim) { - case 0: - return __nvvm_read_ptx_sreg_ctaid_x(); - case 1: - return __nvvm_read_ptx_sreg_ctaid_y(); - case 2: - return __nvvm_read_ptx_sreg_ctaid_z(); - }; - UNREACHABLE("Dim outside range!"); -} - -uint32_t getNumberOfBlocksInKernel(int32_t Dim) { - switch (Dim) { - case 0: - return __nvvm_read_ptx_sreg_nctaid_x(); - case 1: - return __nvvm_read_ptx_sreg_nctaid_y(); - case 2: - return __nvvm_read_ptx_sreg_nctaid_z(); - }; - UNREACHABLE("Dim outside range!"); -} - -uint32_t getNumberOfThreadsInKernel() { - return impl::getNumberOfThreadsInBlock(0) * - impl::getNumberOfBlocksInKernel(0) * - impl::getNumberOfThreadsInBlock(1) * - impl::getNumberOfBlocksInKernel(1) * - impl::getNumberOfThreadsInBlock(2) * - impl::getNumberOfBlocksInKernel(2); -} - -uint32_t getWarpIdInBlock() { - return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); -} - -uint32_t getNumberOfWarpsInBlock() { - return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / - mapping::getWarpSize(); -} - -#endif -///} - -} // namespace impl -} // namespace ompx - -/// We have to be deliberate about the distinction of `mapping::` and `impl::` -/// below to avoid repeating assumptions or including irrelevant ones. -///{ - static bool isInLastWarp() { uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) & ~(mapping::getWarpSize() - 1); @@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() { return utils::popc(Active & LaneMaskLT) == 0; } -LaneMaskTy mapping::activemask() { return impl::activemask(); } +LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); } -LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } +LaneMaskTy mapping::lanemaskLT() { +#ifdef __NVPTX__ + return __nvvm_read_ptx_sreg_lanemask_lt(); +#else + uint32_t Lane = mapping::getThreadIdInWarp(); + int64_t Ballot = mapping::activemask(); + uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; + return Mask & Ballot; +#endif +} -LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } +LaneMaskTy mapping::lanemaskGT() { +#ifdef __NVPTX__ + return __nvvm_read_ptx_sreg_lanemask_gt(); +#else + uint32_t Lane = mapping::getThreadIdInWarp(); + if (Lane == (mapping::getWarpSize() - 1)) + return 0; + int64_t Ballot = mapping::activemask(); + uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); + return Mask & Ballot; +#endif +} uint32_t mapping::getThreadIdInWarp() { - uint32_t ThreadIdInWarp = impl::getThreadIdInWarp(); - ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr); + uint32_t ThreadIdInWarp = __gpu_lane_id(); + ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr); return ThreadIdInWarp; } uint32_t mapping::getThreadIdInBlock(int32_t Dim) { - uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim); + uint32_t ThreadIdInBlock = __gpu_thread_id(Dim); return ThreadIdInBlock; } -uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } +uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); } uint32_t mapping::getMaxTeamThreads(bool IsSPMD) { uint32_t BlockSize = mapping::getNumberOfThreadsInBlock(); // If we are in SPMD mode, remove one warp. - return BlockSize - (!IsSPMD * impl::getWarpSize()); + return BlockSize - (!IsSPMD * mapping::getWarpSize()); } uint32_t mapping::getMaxTeamThreads() { return mapping::getMaxTeamThreads(mapping::isSPMDMode()); } uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) { - return impl::getNumberOfThreadsInBlock(Dim); + return __gpu_num_threads(Dim); } uint32_t mapping::getNumberOfThreadsInKernel() { - return impl::getNumberOfThreadsInKernel(); + return mapping::getNumberOfThreadsInBlock(0) * + mapping::getNumberOfBlocksInKernel(0) * + mapping::getNumberOfThreadsInBlock(1) * + mapping::getNumberOfBlocksInKernel(1) * + mapping::getNumberOfThreadsInBlock(2) * + mapping::getNumberOfBlocksInKernel(2); } uint32_t mapping::getWarpIdInBlock() { - uint32_t WarpID = impl::getWarpIdInBlock(); - ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr); + uint32_t WarpID = + mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); + ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr); return WarpID; } uint32_t mapping::getBlockIdInKernel(int32_t Dim) { - uint32_t BlockId = impl::getBlockIdInKernel(Dim); - ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr); + uint32_t BlockId = __gpu_block_id(Dim); + ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr); return BlockId; } uint32_t mapping::getNumberOfWarpsInBlock() { - uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock(); - ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr); - return NumberOfWarpsInBlocks; + return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); } uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) { - uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim); - ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr); - return NumberOfBlocks; + return __gpu_num_blocks(Dim); } uint32_t mapping::getNumberOfProcessorElements() { @@ -326,11 +163,11 @@ extern "C" { } [[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() { - return impl::getNumberOfThreadsInBlock(mapping::DIM_X); + return mapping::getNumberOfThreadsInBlock(mapping::DIM_X); } [[gnu::noinline]] uint32_t __kmpc_get_warp_size() { - return impl::getWarpSize(); + return mapping::getWarpSize(); } } From 881a69aa5be2cabf82a3e6bb782c1ae629cd98bd Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 17 Mar 2025 11:49:34 -0500 Subject: [PATCH 2/2] [OpenMP] Replace utilities with 'gpuintrin.h' definitions Summary: Follows from the previous one in https://github.com/llvm/llvm-project/pull/131631. --- offload/DeviceRTL/src/DeviceUtils.cpp | 99 ++++----------------------- offload/DeviceRTL/src/Misc.cpp | 47 +++---------- 2 files changed, 25 insertions(+), 121 deletions(-) diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp index d8109537832e9..d6f8c499c8904 100644 --- a/offload/DeviceRTL/src/DeviceUtils.cpp +++ b/offload/DeviceRTL/src/DeviceUtils.cpp @@ -14,117 +14,48 @@ #include "Debug.h" #include "Interface.h" #include "Mapping.h" +#include "gpuintrin.h" using namespace ompx; -namespace impl { - -void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { - static_assert(sizeof(unsigned long) == 8, ""); - *LowBits = static_cast(Val & 0x00000000FFFFFFFFUL); - *HighBits = static_cast((Val & 0xFFFFFFFF00000000UL) >> 32); -} - -uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { - return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; -} - -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, - int32_t Width); - -uint64_t ballotSync(uint64_t Mask, int32_t Pred); - -/// AMDGCN Implementation -/// -///{ -#ifdef __AMDGPU__ - -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { - int Self = mapping::getThreadIdInWarp(); - int Index = SrcLane + (Self & ~(Width - 1)); - return __builtin_amdgcn_ds_bpermute(Index << 2, Var); -} - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, - int32_t Width) { - int Self = mapping::getThreadIdInWarp(); - int Index = Self + LaneDelta; - Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index; - return __builtin_amdgcn_ds_bpermute(Index << 2, Var); -} - -uint64_t ballotSync(uint64_t Mask, int32_t Pred) { - return Mask & __builtin_amdgcn_ballot_w64(Pred); -} - -bool isSharedMemPtr(const void *Ptr) { - return __builtin_amdgcn_is_shared( - (const __attribute__((address_space(0))) void *)Ptr); -} -#endif -///} - -/// NVPTX Implementation -/// -///{ -#ifdef __NVPTX__ - -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { - return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1); -} - -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { - int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f; - return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T); -} - -uint64_t ballotSync(uint64_t Mask, int32_t Pred) { - return __nvvm_vote_ballot_sync(static_cast(Mask), Pred); -} - -bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); } - -#endif -///} -} // namespace impl - uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { - return impl::Pack(LowBits, HighBits); + return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; } void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { - impl::Unpack(Val, &LowBits, &HighBits); + static_assert(sizeof(unsigned long) == 8, ""); + LowBits = static_cast(Val & 0x00000000FFFFFFFFUL); + HighBits = static_cast((Val & 0xFFFFFFFF00000000UL) >> 32); } int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { - return impl::shuffle(Mask, Var, SrcLane, Width); + return __gpu_shuffle_idx_u32(Mask, SrcLane, Var, Width); } int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { - return impl::shuffleDown(Mask, Var, Delta, Width); + int32_t Self = mapping::getThreadIdInWarp(); + int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta; + return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); } int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width) { - uint32_t Lo, Hi; - utils::unpack(Var, Lo, Hi); - Hi = impl::shuffleDown(Mask, Hi, Delta, Width); - Lo = impl::shuffleDown(Mask, Lo, Delta, Width); - return utils::pack(Lo, Hi); + int32_t Self = mapping::getThreadIdInWarp(); + int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta; + return __gpu_shuffle_idx_u64(Mask, Index, Var, Width); } uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) { - return impl::ballotSync(Mask, Pred); + return __gpu_ballot(Mask, Pred); } -bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); } +bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); } extern "C" { int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { - return impl::shuffleDown(lanes::All, Val, Delta, SrcLane); + return utils::shuffleDown(lanes::All, Val, Delta, SrcLane); } int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) { diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index 734e937f03920..a89f8b2a74531 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -20,41 +20,6 @@ namespace ompx { namespace impl { -/// AMDGCN Implementation -/// -///{ -#ifdef __AMDGPU__ - -double getWTick() { - // The number of ticks per second for the AMDGPU clock varies by card and can - // only be retrieved by querying the driver. We rely on the device environment - // to inform us what the proper frequency is. - return 1.0 / config::getClockFrequency(); -} - -double getWTime() { - return static_cast(__builtin_readsteadycounter()) * getWTick(); -} - -#endif - -/// NVPTX Implementation -/// -///{ -#ifdef __NVPTX__ - -double getWTick() { - // Timer precision is 1ns - return ((double)1E-9); -} - -double getWTime() { - uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer(); - return static_cast(nsecs) * getWTick(); -} - -#endif - /// Lookup a device-side function using a host pointer /p HstPtr using the table /// provided by the device plugin. The table is an ordered pair of host and /// device pointers sorted on the value of the host pointer. @@ -112,9 +77,17 @@ int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } -double omp_get_wtick(void) { return ompx::impl::getWTick(); } +double omp_get_wtick(void) { + // The number of ticks per second for the AMDGPU clock varies by card and can + // only be retrieved by querying the driver. We rely on the device environment + // to inform us what the proper frequency is. NVPTX uses a nanosecond + // resolution, we could omit the global read but this makes it consistent. + return 1.0 / ompx::config::getClockFrequency(); +} -double omp_get_wtime(void) { return ompx::impl::getWTime(); } +double omp_get_wtime(void) { + return static_cast(__builtin_readsteadycounter()) * omp_get_wtick(); +} void *__llvm_omp_indirect_call_lookup(void *HstPtr) { return ompx::impl::indirectCallLookup(HstPtr);