[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers #131631

jhuber6 · 2025-03-17T15:45:33Z

Summary:
This patch ports the runtime to use gpuintrin.h instead of calling the
builtins for most things. The lanemask_gt stuff was left for now with
a fallback.

AMD version for Ron https://gist.github.com/jhuber6/42014d635b9a8158727640876bf47226.

Summary: This patch ports the runtime to use `gpuintrin.h` instead of calling the builtins for most things. The `lanemask_gt` stuff was left for now with a fallback.

llvmbot · 2025-03-17T15:46:12Z

@llvm/pr-subscribers-offload

Author: Joseph Huber (jhuber6)

Changes

Summary:
This patch ports the runtime to use gpuintrin.h instead of calling the
builtins for most things. The lanemask_gt stuff was left for now with
a fallback.

Full diff: https://github.com/llvm/llvm-project/pull/131631.diff

2 Files Affected:

(modified) offload/DeviceRTL/include/Mapping.h (+3-3)
(modified) offload/DeviceRTL/src/Mapping.cpp (+46-209)

diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index f892a025159d4..8ba018b5314aa 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -19,9 +19,9 @@ namespace ompx {
 namespace mapping {
 
 enum {
-  DIM_X = 0,
-  DIM_Y = 1,
-  DIM_Z = 2,
+  DIM_X = __GPU_X_DIM,
+  DIM_Y = __GPU_Y_DIM,
+  DIM_Z = __GPU_Z_DIM,
 };
 
 inline constexpr uint32_t MaxThreadsPerTeam = 1024;
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 641be81cca3ed..a9e027727b04b 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -14,198 +14,12 @@
 #include "DeviceUtils.h"
 #include "Interface.h"
 #include "State.h"
+#include "gpuintrin.h"
 
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
 using namespace ompx;
 
-namespace ompx {
-namespace impl {
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workgroup_size_x();
-  case 1:
-    return __builtin_amdgcn_workgroup_size_y();
-  case 2:
-    return __builtin_amdgcn_workgroup_size_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
-LaneMaskTy lanemaskLT() {
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
-  return Mask & Ballot;
-}
-
-LaneMaskTy lanemaskGT() {
-  uint32_t Lane = mapping::getThreadIdInWarp();
-  if (Lane == (mapping::getWarpSize() - 1))
-    return 0;
-  int64_t Ballot = mapping::activemask();
-  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
-  return Mask & Ballot;
-}
-
-uint32_t getThreadIdInWarp() {
-  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workitem_id_x();
-  case 1:
-    return __builtin_amdgcn_workitem_id_y();
-  case 2:
-    return __builtin_amdgcn_workitem_id_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
-  return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
-         __builtin_amdgcn_grid_size_z();
-}
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_workgroup_id_x();
-  case 1:
-    return __builtin_amdgcn_workgroup_id_y();
-  case 2:
-    return __builtin_amdgcn_workgroup_id_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-  case 1:
-    return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
-  case 2:
-    return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpIdInBlock() {
-  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
-  return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_ntid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_ntid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_ntid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_tid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_tid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_tid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_ctaid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_ctaid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_ctaid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
-  switch (Dim) {
-  case 0:
-    return __nvvm_read_ptx_sreg_nctaid_x();
-  case 1:
-    return __nvvm_read_ptx_sreg_nctaid_y();
-  case 2:
-    return __nvvm_read_ptx_sreg_nctaid_z();
-  };
-  UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
-  return impl::getNumberOfThreadsInBlock(0) *
-         impl::getNumberOfBlocksInKernel(0) *
-         impl::getNumberOfThreadsInBlock(1) *
-         impl::getNumberOfBlocksInKernel(1) *
-         impl::getNumberOfThreadsInBlock(2) *
-         impl::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t getWarpIdInBlock() {
-  return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
-  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
-         mapping::getWarpSize();
-}
-
-#endif
-///}
-
-} // namespace impl
-} // namespace ompx
-
-/// We have to be deliberate about the distinction of `mapping::` and `impl::`
-/// below to avoid repeating assumptions or including irrelevant ones.
-///{
-
 static bool isInLastWarp() {
   uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
                      ~(mapping::getWarpSize() - 1);
@@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() {
   return utils::popc(Active & LaneMaskLT) == 0;
 }
 
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
+LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
 
-LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
+LaneMaskTy mapping::lanemaskLT() {
+#ifdef __NVPTX__
+  return __nvvm_read_ptx_sreg_lanemask_lt();
+#else
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+  return Mask & Ballot;
+#endif
+}
 
-LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
+LaneMaskTy mapping::lanemaskGT() {
+#ifdef __NVPTX__
+  return __nvvm_read_ptx_sreg_lanemask_gt();
+#else
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+  return Mask & Ballot;
+#endif
+}
 
 uint32_t mapping::getThreadIdInWarp() {
-  uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
-  ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
+  uint32_t ThreadIdInWarp = __gpu_lane_id();
+  ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
   return ThreadIdInWarp;
 }
 
 uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
-  uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
+  uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
   return ThreadIdInBlock;
 }
 
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
 
 uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
   uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
   // If we are in SPMD mode, remove one warp.
-  return BlockSize - (!IsSPMD * impl::getWarpSize());
+  return BlockSize - (!IsSPMD * mapping::getWarpSize());
 }
 uint32_t mapping::getMaxTeamThreads() {
   return mapping::getMaxTeamThreads(mapping::isSPMDMode());
 }
 
 uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
-  return impl::getNumberOfThreadsInBlock(Dim);
+  return __gpu_num_threads(Dim);
 }
 
 uint32_t mapping::getNumberOfThreadsInKernel() {
-  return impl::getNumberOfThreadsInKernel();
+  return mapping::getNumberOfThreadsInBlock(0) *
+         mapping::getNumberOfBlocksInKernel(0) *
+         mapping::getNumberOfThreadsInBlock(1) *
+         mapping::getNumberOfBlocksInKernel(1) *
+         mapping::getNumberOfThreadsInBlock(2) *
+         mapping::getNumberOfBlocksInKernel(2);
 }
 
 uint32_t mapping::getWarpIdInBlock() {
-  uint32_t WarpID = impl::getWarpIdInBlock();
-  ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
+  uint32_t WarpID =
+      mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
+  ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
   return WarpID;
 }
 
 uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
-  uint32_t BlockId = impl::getBlockIdInKernel(Dim);
-  ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
+  uint32_t BlockId = __gpu_block_id(Dim);
+  ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
   return BlockId;
 }
 
 uint32_t mapping::getNumberOfWarpsInBlock() {
-  uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
-  ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
-  return NumberOfWarpsInBlocks;
+  return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
+         mapping::getWarpSize();
 }
 
 uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
-  uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
-  ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
-  return NumberOfBlocks;
+  return __gpu_num_blocks(Dim);
 }
 
 uint32_t mapping::getNumberOfProcessorElements() {
@@ -326,11 +163,11 @@ extern "C" {
 }
 
 [[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
-  return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
+  return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
 }
 
 [[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
-  return impl::getWarpSize();
+  return mapping::getWarpSize();
 }
 }

jhuber6 · 2025-03-17T15:49:41Z

@ronlieb AMD version https://gist.github.com/jhuber6/42014d635b9a8158727640876bf47226

Summary: Follows from the previous one in llvm#131631.

JonChesterfield

I love it! Very like #131907 except you implemented more of it in the first pass. Ship it

[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers

71d68ed

Summary: This patch ports the runtime to use `gpuintrin.h` instead of calling the builtins for most things. The `lanemask_gt` stuff was left for now with a fallback.

jhuber6 requested review from JonChesterfield, arsenm, jdoerfert, jplehr, saiislam and shiltian March 17, 2025 15:45

llvmbot added the offload label Mar 17, 2025

jhuber6 added a commit to jhuber6/llvm-project that referenced this pull request Mar 17, 2025

[OpenMP] Replace utilities with 'gpuintrin.h' definitions

881a69a

Summary: Follows from the previous one in llvm#131631.

jhuber6 mentioned this pull request Mar 18, 2025

[openmp][nfc] Use clang gpuintrin for some dispatch to target intrinsics #131907

Closed

JonChesterfield approved these changes Mar 18, 2025

View reviewed changes

JonChesterfield mentioned this pull request Mar 18, 2025

[OpenMP] Replace utilities with 'gpuintrin.h' definitions #131644

Merged

jhuber6 merged commit 206f78d into llvm:main Mar 18, 2025
11 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers #131631

[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers #131631

Uh oh!

jhuber6 commented Mar 17, 2025 •

edited

Loading

Uh oh!

llvmbot commented Mar 17, 2025

Uh oh!

jhuber6 commented Mar 17, 2025

Uh oh!

JonChesterfield left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers #131631

[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers #131631

Uh oh!

Conversation

jhuber6 commented Mar 17, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Mar 17, 2025

Uh oh!

jhuber6 commented Mar 17, 2025

Uh oh!

JonChesterfield left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

jhuber6 commented Mar 17, 2025 •

edited

Loading