Skip to content

Conversation

@jhuber6
Copy link
Contributor

@jhuber6 jhuber6 commented Mar 17, 2025

Summary:
This patch ports the runtime to use gpuintrin.h instead of calling the
builtins for most things. The lanemask_gt stuff was left for now with
a fallback.

AMD version for Ron https://gist.github.com/jhuber6/42014d635b9a8158727640876bf47226.

Summary:
This patch ports the runtime to use `gpuintrin.h` instead of calling the
builtins for most things. The `lanemask_gt` stuff was left for now with
a fallback.
@llvmbot
Copy link
Member

llvmbot commented Mar 17, 2025

@llvm/pr-subscribers-offload

Author: Joseph Huber (jhuber6)

Changes

Summary:
This patch ports the runtime to use gpuintrin.h instead of calling the
builtins for most things. The lanemask_gt stuff was left for now with
a fallback.


Full diff: https://github.com/llvm/llvm-project/pull/131631.diff

2 Files Affected:

  • (modified) offload/DeviceRTL/include/Mapping.h (+3-3)
  • (modified) offload/DeviceRTL/src/Mapping.cpp (+46-209)
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index f892a025159d4..8ba018b5314aa 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -19,9 +19,9 @@ namespace ompx {
 namespace mapping {
 
 enum {
-  DIM_X = 0,
-  DIM_Y = 1,
-  DIM_Z = 2,
+  DIM_X = __GPU_X_DIM,
+  DIM_Y = __GPU_Y_DIM,
+  DIM_Z = __GPU_Z_DIM,
 };
 
 inline constexpr uint32_t MaxThreadsPerTeam = 1024;
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 641be81cca3ed..a9e027727b04b 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -14,198 +14,12 @@
 #include "DeviceUtils.h"
 #include "Interface.h"
 #include "State.h"
+#include "gpuintrin.h"
 
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
 using namespace ompx;
 
-namespace ompx {
-namespace impl {
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workgroup_size_x();
-  case 1:
-    return __builtin_amdgcn_workgroup_size_y();
-  case 2:
-    return __builtin_amdgcn_workgroup_size_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
-LaneMaskTy lanemaskLT() {
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
-  return Mask & Ballot;
-}
-
-LaneMaskTy lanemaskGT() {
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  if (Lane == (mapping::getWarpSize() - 1))
-    return 0;
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
-  return Mask & Ballot;
-}
-
-uint32_t getThreadIdInWarp() {
-  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workitem_id_x();
-  case 1:
-    return __builtin_amdgcn_workitem_id_y();
-  case 2:
-    return __builtin_amdgcn_workitem_id_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
-  return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
-         __builtin_amdgcn_grid_size_z();
-}
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workgroup_id_x();
-  case 1:
-    return __builtin_amdgcn_workgroup_id_y();
-  case 2:
-    return __builtin_amdgcn_workgroup_id_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-  case 1:
-    return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
-  case 2:
-    return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpIdInBlock() {
-  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
-  return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_ntid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_ntid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_ntid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_tid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_tid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_tid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_ctaid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_ctaid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_ctaid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_nctaid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_nctaid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_nctaid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
-  return impl::getNumberOfThreadsInBlock(0) *
-         impl::getNumberOfBlocksInKernel(0) *
-         impl::getNumberOfThreadsInBlock(1) *
-         impl::getNumberOfBlocksInKernel(1) *
-         impl::getNumberOfThreadsInBlock(2) *
-         impl::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t getWarpIdInBlock() {
-  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
-  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
-         mapping::getWarpSize();
-}
-
-#endif
-///}
-
-} // namespace impl
-} // namespace ompx
-
-/// We have to be deliberate about the distinction of `mapping::` and `impl::`
-/// below to avoid repeating assumptions or including irrelevant ones.
-///{
-
 static bool isInLastWarp() {
   uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
                      ~(mapping::getWarpSize() - 1);
@@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() {
   return utils::popc(Active & LaneMaskLT) == 0;
 }
 
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
+LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
 
-LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
+LaneMaskTy mapping::lanemaskLT() {
+#ifdef __NVPTX__
+  return __nvvm_read_ptx_sreg_lanemask_lt();
+#else
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+  return Mask & Ballot;
+#endif
+}
 
-LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
+LaneMaskTy mapping::lanemaskGT() {
+#ifdef __NVPTX__
+  return __nvvm_read_ptx_sreg_lanemask_gt();
+#else
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+  return Mask & Ballot;
+#endif
+}
 
 uint32_t mapping::getThreadIdInWarp() {
-  uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
-  ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
+  uint32_t ThreadIdInWarp = __gpu_lane_id();
+  ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
   return ThreadIdInWarp;
 }
 
 uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
-  uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
+  uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
   return ThreadIdInBlock;
 }
 
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
 
 uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
   uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
   // If we are in SPMD mode, remove one warp.
-  return BlockSize - (!IsSPMD * impl::getWarpSize());
+  return BlockSize - (!IsSPMD * mapping::getWarpSize());
 }
 uint32_t mapping::getMaxTeamThreads() {
   return mapping::getMaxTeamThreads(mapping::isSPMDMode());
 }
 
 uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
-  return impl::getNumberOfThreadsInBlock(Dim);
+  return __gpu_num_threads(Dim);
 }
 
 uint32_t mapping::getNumberOfThreadsInKernel() {
-  return impl::getNumberOfThreadsInKernel();
+  return mapping::getNumberOfThreadsInBlock(0) *
+         mapping::getNumberOfBlocksInKernel(0) *
+         mapping::getNumberOfThreadsInBlock(1) *
+         mapping::getNumberOfBlocksInKernel(1) *
+         mapping::getNumberOfThreadsInBlock(2) *
+         mapping::getNumberOfBlocksInKernel(2);
 }
 
 uint32_t mapping::getWarpIdInBlock() {
-  uint32_t WarpID = impl::getWarpIdInBlock();
-  ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
+  uint32_t WarpID =
+      mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
+  ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
   return WarpID;
 }
 
 uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
-  uint32_t BlockId = impl::getBlockIdInKernel(Dim);
-  ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
+  uint32_t BlockId = __gpu_block_id(Dim);
+  ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
   return BlockId;
 }
 
 uint32_t mapping::getNumberOfWarpsInBlock() {
-  uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
-  ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
-  return NumberOfWarpsInBlocks;
+  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
+         mapping::getWarpSize();
 }
 
 uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
-  uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
-  ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
-  return NumberOfBlocks;
+  return __gpu_num_blocks(Dim);
 }
 
 uint32_t mapping::getNumberOfProcessorElements() {
@@ -326,11 +163,11 @@ extern "C" {
 }
 
 [[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
-  return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
+  return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
 }
 
 [[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
-  return impl::getWarpSize();
+  return mapping::getWarpSize();
 }
 }
 

@jhuber6
Copy link
Contributor Author

jhuber6 commented Mar 17, 2025

jhuber6 added a commit to jhuber6/llvm-project that referenced this pull request Mar 17, 2025
Copy link
Collaborator

@JonChesterfield JonChesterfield left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I love it! Very like #131907 except you implemented more of it in the first pass. Ship it

@jhuber6 jhuber6 merged commit 206f78d into llvm:main Mar 18, 2025
11 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants