-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers #131631
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Summary: This patch ports the runtime to use `gpuintrin.h` instead of calling the builtins for most things. The `lanemask_gt` stuff was left for now with a fallback.
|
@llvm/pr-subscribers-offload Author: Joseph Huber (jhuber6) ChangesSummary: Full diff: https://github.com/llvm/llvm-project/pull/131631.diff 2 Files Affected:
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index f892a025159d4..8ba018b5314aa 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -19,9 +19,9 @@ namespace ompx {
namespace mapping {
enum {
- DIM_X = 0,
- DIM_Y = 1,
- DIM_Z = 2,
+ DIM_X = __GPU_X_DIM,
+ DIM_Y = __GPU_Y_DIM,
+ DIM_Z = __GPU_Z_DIM,
};
inline constexpr uint32_t MaxThreadsPerTeam = 1024;
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 641be81cca3ed..a9e027727b04b 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -14,198 +14,12 @@
#include "DeviceUtils.h"
#include "Interface.h"
#include "State.h"
+#include "gpuintrin.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace ompx;
-namespace ompx {
-namespace impl {
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
-LaneMaskTy lanemaskLT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
- return Mask & Ballot;
-}
-
-LaneMaskTy lanemaskGT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- if (Lane == (mapping::getWarpSize() - 1))
- return 0;
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
- return Mask & Ballot;
-}
-
-uint32_t getThreadIdInWarp() {
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workitem_id_x();
- case 1:
- return __builtin_amdgcn_workitem_id_y();
- case 2:
- return __builtin_amdgcn_workitem_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
- __builtin_amdgcn_grid_size_z();
-}
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_id_x();
- case 1:
- return __builtin_amdgcn_workgroup_id_y();
- case 2:
- return __builtin_amdgcn_workgroup_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
- return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ntid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ntid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ntid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_tid_x();
- case 1:
- return __nvvm_read_ptx_sreg_tid_y();
- case 2:
- return __nvvm_read_ptx_sreg_tid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_nctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_nctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_nctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return impl::getNumberOfThreadsInBlock(0) *
- impl::getNumberOfBlocksInKernel(0) *
- impl::getNumberOfThreadsInBlock(1) *
- impl::getNumberOfBlocksInKernel(1) *
- impl::getNumberOfThreadsInBlock(2) *
- impl::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
- return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
- mapping::getWarpSize();
-}
-
-#endif
-///}
-
-} // namespace impl
-} // namespace ompx
-
-/// We have to be deliberate about the distinction of `mapping::` and `impl::`
-/// below to avoid repeating assumptions or including irrelevant ones.
-///{
-
static bool isInLastWarp() {
uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
~(mapping::getWarpSize() - 1);
@@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() {
return utils::popc(Active & LaneMaskLT) == 0;
}
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
+LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
-LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
+LaneMaskTy mapping::lanemaskLT() {
+#ifdef __NVPTX__
+ return __nvvm_read_ptx_sreg_lanemask_lt();
+#else
+ uint32_t Lane = mapping::getThreadIdInWarp();
+ int64_t Ballot = mapping::activemask();
+ uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+ return Mask & Ballot;
+#endif
+}
-LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
+LaneMaskTy mapping::lanemaskGT() {
+#ifdef __NVPTX__
+ return __nvvm_read_ptx_sreg_lanemask_gt();
+#else
+ uint32_t Lane = mapping::getThreadIdInWarp();
+ if (Lane == (mapping::getWarpSize() - 1))
+ return 0;
+ int64_t Ballot = mapping::activemask();
+ uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+ return Mask & Ballot;
+#endif
+}
uint32_t mapping::getThreadIdInWarp() {
- uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
- ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
+ uint32_t ThreadIdInWarp = __gpu_lane_id();
+ ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
return ThreadIdInWarp;
}
uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
- uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
+ uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
return ThreadIdInBlock;
}
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
// If we are in SPMD mode, remove one warp.
- return BlockSize - (!IsSPMD * impl::getWarpSize());
+ return BlockSize - (!IsSPMD * mapping::getWarpSize());
}
uint32_t mapping::getMaxTeamThreads() {
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
}
uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
- return impl::getNumberOfThreadsInBlock(Dim);
+ return __gpu_num_threads(Dim);
}
uint32_t mapping::getNumberOfThreadsInKernel() {
- return impl::getNumberOfThreadsInKernel();
+ return mapping::getNumberOfThreadsInBlock(0) *
+ mapping::getNumberOfBlocksInKernel(0) *
+ mapping::getNumberOfThreadsInBlock(1) *
+ mapping::getNumberOfBlocksInKernel(1) *
+ mapping::getNumberOfThreadsInBlock(2) *
+ mapping::getNumberOfBlocksInKernel(2);
}
uint32_t mapping::getWarpIdInBlock() {
- uint32_t WarpID = impl::getWarpIdInBlock();
- ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
+ uint32_t WarpID =
+ mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
+ ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
return WarpID;
}
uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
- uint32_t BlockId = impl::getBlockIdInKernel(Dim);
- ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
+ uint32_t BlockId = __gpu_block_id(Dim);
+ ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
return BlockId;
}
uint32_t mapping::getNumberOfWarpsInBlock() {
- uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
- ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
- return NumberOfWarpsInBlocks;
+ return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
+ mapping::getWarpSize();
}
uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
- uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
- ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
- return NumberOfBlocks;
+ return __gpu_num_blocks(Dim);
}
uint32_t mapping::getNumberOfProcessorElements() {
@@ -326,11 +163,11 @@ extern "C" {
}
[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
- return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
+ return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
}
[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
- return impl::getWarpSize();
+ return mapping::getWarpSize();
}
}
|
Summary: Follows from the previous one in llvm#131631.
JonChesterfield
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I love it! Very like #131907 except you implemented more of it in the first pass. Ship it
Summary:
This patch ports the runtime to use
gpuintrin.hinstead of calling thebuiltins for most things. The
lanemask_gtstuff was left for now witha fallback.
AMD version for Ron https://gist.github.com/jhuber6/42014d635b9a8158727640876bf47226.