iree-org
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
Lines changed: 24 additions & 39 deletions
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/argmax_ukernel.c b/compiler/plugins/target/ROCM/builtins/ukernel/argmax_ukernel.c
Lines changed: 58 additions & 64 deletions
@@ -7,29 +7,13 @@ if(NOT IREE_TARGET_BACKEND_ROCM)
   return()
 endif()
 
-# Check if HIP is installed on system.
-# HIP is required to compile ukernels.
-# TODO: We can do better than this and ensure that headers are always available.
-if(NOT IREE_ROCM_PATH)
-  set(IREE_ROCM_PATH "/opt/rocm")
-endif()
-set(IREE_ROCM_VERSION "${IREE_ROCM_PATH}/include/hip/hip_version.h")
-if(NOT EXISTS ${IREE_ROCM_VERSION})
-  message(STATUS
-          "hip runtime cannot be found in ${IREE_ROCM_PATH}.
-          Please try setting IREE_ROCM_PATH to rocm directory.
-          Ukernels will not be compiled.")
-  return()
-endif()
-
-
 iree_add_all_subdirs()
 
 set(_platform_lib_reldir "iree_platform_libs/rocm")
 set(_device_bc_path "${IREE_COMPILER_DYLIB_DIR}/iree_platform_libs/rocm")
 set(_amd_ukernel_libs)
 set(_amd_ukernel_targets)
-function(iree_rocm_bitcode_library)
+function(iree_amdgpu_bitcode_library)
   cmake_parse_arguments(
     _RULE
     ""
@@ -45,30 +29,29 @@ function(iree_rocm_bitcode_library)
   endif()
 
   set(_ROCM_ARCH "${_RULE_ROCM_ARCH}")
-  set(OPT_FLAG "-O0")
-  if(_ROCM_ARCH MATCHES "GFX9")
-    set(OPT_FLAG "-O3")
-  endif()
   set(_COPTS
-    "-x" "hip"
+    # Language: C23
+    "-x" "c"
+    "-std=c23"
 
-    # Compile only the device code for the target architecture.
-    "--offload-device-only"
-    "--offload-arch=${_ROCM_ARCH}"
+    # Local headers.
+    "-I${IREE_SOURCE_DIR}"
 
-    # Suppress warnings about about ROCM version (we mostly don't care).
-    "-D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH"
+    # Avoid dependencies.
+    "-nogpulib"
 
-    # Use the ROCM specified by the IREE cmake variable (instead of guessing
-    # or failing if ROCM is not on the user's path).
-    "--rocm-path=${IREE_ROCM_PATH}"
+    # Avoid ABI issues.
+    "-fno-short-wchar"  # Shouldn't matter to us, but doesn't hurt.
 
-    # Avoid linking in default libraries as we will link them at a later phase.
-    "-nogpulib"
+    # Target architecture/machine.
+    "-target" "amdgcn-amd-amdhsa"
+    "-march=${_ROCM_ARCH}"
+    "-fgpu-rdc"  # NOTE: may not be required for all targets.
 
-    # Only enable necessary optimizations S.T we can use -O3.
-    "-Xclang" "-disable-llvm-optzns"
-    "${OPT_FLAG}"
+    # Optimized.
+    "-O3"
+    "-fno-ident"
+    "-fvisibility=hidden"
 
     # Object file only in bitcode format:
     "-c"
@@ -77,7 +60,8 @@ function(iree_rocm_bitcode_library)
 
   set(_BITCODE_FILES)
   foreach(_SRC ${_RULE_SRCS})
-    get_filename_component(_BITCODE_SRC_PATH "${_SRC}" REALPATH)
+    get_filename_component(_SRC_PATH "${_SRC}" REALPATH)
+    get_filename_component(_COMMON_H_PATH "common.h" REALPATH)
     set(_BITCODE_FILE "${_RULE_NAME}_${_SRC}_${_ROCM_ARCH}.bc")
     list(APPEND _BITCODE_FILES ${_BITCODE_FILE})
     add_custom_command(
@@ -86,12 +70,13 @@ function(iree_rocm_bitcode_library)
       COMMAND
         "${IREE_CLANG_BINARY}"
         ${_COPTS}
-        "${_BITCODE_SRC_PATH}"
+        "${_SRC_PATH}"
         "-o"
         "${_BITCODE_FILE}"
       DEPENDS
         "${IREE_CLANG_BINARY}"
-        "${_SRC}"
+        "${_SRC_PATH}"
+        "${_COMMON_H_PATH}"
       COMMENT
         "Compiling ${_SRC} to ${_BITCODE_FILE}"
       VERBATIM
@@ -127,7 +112,7 @@ endfunction()
 #       except compile-time cost, so just picked out the popular ones.
 set(_ukernel_supported_chips "gfx90a" "gfx942" "gfx1030" "gfx1100")
 foreach(_amd_chip ${_ukernel_supported_chips})
-  iree_rocm_bitcode_library(
+  iree_amdgpu_bitcode_library(
     NAME
       rocm_argmax_ukernel
     ROCM_ARCH
 
@@ -4,15 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <float.h>
-#include <hip/hip_fp16.h>
-#include <hip/hip_runtime.h>
-
-extern "C" __device__ __attribute__((const)) half __ockl_wfred_max_f16(half);
-extern "C" __device__
-    __attribute__((const)) int64_t __ockl_wfred_min_i64(int64_t);
-extern "C" __device__
-    __attribute__((const)) int32_t __ockl_wfred_min_i32(int32_t);
+#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
 
 /*
 Constraint/Tiling note:
@@ -21,27 +13,27 @@ only use single subgroup/warp per workgroup. This constraint is also set during
 tiling phase in KernelConfig.
 */
 
-extern "C" __device__ void __iree_uk_rocm_argmax_F32I32(float *inputBuffer,
-                                                        size_t input_offset,
-                                                        int32_t *outputBuffer,
-                                                        size_t output_offset,
-                                                        size_t reductionSize) {
-  uint laneID = __builtin_amdgcn_workitem_id_x();
+void __iree_uk_rocm_argmax_F32I32(const float *inputBuffer,
+                                  int64_t input_offset, int32_t *outputBuffer,
+                                  int64_t output_offset,
+                                  int64_t reductionSize) {
+  const int warpSize = __builtin_amdgcn_wavefrontsize();
+  int32_t laneID = __builtin_amdgcn_workitem_id_x();
   // Set identity value to handle problem non divisible by subgroupSize.
   float laneMax =
       laneID >= reductionSize ? -FLT_MAX : inputBuffer[input_offset + laneID];
   int32_t laneResult = laneID;
 
   // NOTE: On F32 kernels with clang, reductionSize/blockDim.x has numerical
   // inaccuracy.
-  uint numBatches = (reductionSize + warpSize - 1) / warpSize;
+  int32_t numBatches = (reductionSize + warpSize - 1) / warpSize;
   for (int i = 1; i < numBatches; ++i) {
-    uint idx = warpSize * i + laneID;
+    int32_t idx = warpSize * i + laneID;
     float newIn =
         idx >= reductionSize ? -FLT_MAX : inputBuffer[input_offset + idx];
     if (newIn == laneMax)
       continue;
-    laneMax = __ocml_fmax_f32(newIn, laneMax);
+    laneMax = __builtin_fmaxf(newIn, laneMax);
     laneResult = newIn == laneMax ? idx : laneResult;
   }
 
@@ -50,12 +42,12 @@ extern "C" __device__ void __iree_uk_rocm_argmax_F32I32(float *inputBuffer,
   // https://github.com/iree-org/iree/issues/16112.
   float wgMax = laneMax;
   for (int i = 1; i < warpSize; i *= 2) {
-    wgMax = __ocml_fmax_f32(__shfl_xor(wgMax, i), wgMax);
+    wgMax = __builtin_fmaxf(__shfl_xor_f(wgMax, i), wgMax);
   }
   // Check if there are multiple max value holders.
   uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
   // if there is only one max value holder, write and exit.
-  if (__popcll(laneHasMaxValmask) == 1) {
+  if (__builtin_popcountll(laneHasMaxValmask) == 1) {
     if (wgMax == laneMax)
       outputBuffer[output_offset] = laneResult;
     return;
@@ -68,27 +60,27 @@ extern "C" __device__ void __iree_uk_rocm_argmax_F32I32(float *inputBuffer,
     outputBuffer[output_offset] = laneResult;
 }
 
-extern "C" __device__ void __iree_uk_rocm_argmax_F32I64(float *inputBuffer,
-                                                        size_t input_offset,
-                                                        int64_t *outputBuffer,
-                                                        size_t output_offset,
-                                                        size_t reductionSize) {
-  uint laneID = __builtin_amdgcn_workitem_id_x();
+void __iree_uk_rocm_argmax_F32I64(const float *inputBuffer,
+                                  int64_t input_offset, int64_t *outputBuffer,
+                                  int64_t output_offset,
+                                  int64_t reductionSize) {
+  const int warpSize = __builtin_amdgcn_wavefrontsize();
+  int32_t laneID = __builtin_amdgcn_workitem_id_x();
   // Set identity value to handle problem non divisible by subgroupSize.
   float laneMax =
       laneID >= reductionSize ? -FLT_MAX : inputBuffer[input_offset + laneID];
   int64_t laneResult = laneID;
 
   // NOTE: On F32 kernels with clang, reductionSize/blockDim.x has numerical
   // inaccuracy.
-  uint numBatches = (reductionSize + warpSize - 1) / warpSize;
+  int32_t numBatches = (reductionSize + warpSize - 1) / warpSize;
   for (int i = 1; i < numBatches; ++i) {
-    uint idx = warpSize * i + laneID;
+    int32_t idx = warpSize * i + laneID;
     float newIn =
         idx >= reductionSize ? -FLT_MAX : inputBuffer[input_offset + idx];
     if (newIn == laneMax)
       continue;
-    laneMax = __ocml_fmax_f32(newIn, laneMax);
+    laneMax = __builtin_fmaxf(newIn, laneMax);
     laneResult = newIn == laneMax ? idx : laneResult;
   }
 
@@ -97,57 +89,58 @@ extern "C" __device__ void __iree_uk_rocm_argmax_F32I64(float *inputBuffer,
   // https://github.com/iree-org/iree/issues/16112.
   float wgMax = laneMax;
   for (int i = 1; i < warpSize; i *= 2) {
-    wgMax = __ocml_fmax_f32(__shfl_xor(wgMax, i), wgMax);
+    wgMax = __builtin_fmaxf(__shfl_xor_f(wgMax, i), wgMax);
   }
   // Check if there are multiple max value holders.
   uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
   // if there is only one max value holder, write and exit.
-  if (__popcll(laneHasMaxValmask) == 1) {
+  if (__builtin_popcountll(laneHasMaxValmask) == 1) {
     if (wgMax == laneMax)
       outputBuffer[output_offset] = laneResult;
     return;
   }
   // if there are multiple max value holder, find smallest index (argmax
   // semantics).
-  int64_t indexVal = wgMax == laneMax ? laneResult : __INT64_MAX__;
+  int64_t indexVal = wgMax == laneMax ? laneResult : INT64_MAX;
   laneResult = __ockl_wfred_min_i64(indexVal);
   if (laneID == 0)
     outputBuffer[output_offset] = laneResult;
 }
 
-extern "C" __device__ void __iree_uk_rocm_argmax_F16I32(half *inputBuffer,
-                                                        size_t input_offset,
-                                                        int32_t *outputBuffer,
-                                                        size_t output_offset,
-                                                        size_t reductionSize) {
-  half NEG_F16_MAX = __float2half(-65504.0f);
-  uint laneID = __builtin_amdgcn_workitem_id_x();
+void __iree_uk_rocm_argmax_F16I32(const _Float16 *inputBuffer,
+                                  int64_t input_offset, int32_t *outputBuffer,
+                                  int64_t output_offset,
+                                  int64_t reductionSize) {
+  const int warpSize = __builtin_amdgcn_wavefrontsize();
+  _Float16 NEG_F16_MAX = (_Float16)(-65504.0f);
+  int32_t laneID = __builtin_amdgcn_workitem_id_x();
   // Set identity value to handle problem non divisible by subgroupSize.
-  half laneMax = laneID >= reductionSize ? NEG_F16_MAX
-                                         : inputBuffer[input_offset + laneID];
+  _Float16 laneMax = laneID >= reductionSize
+                         ? NEG_F16_MAX
+                         : inputBuffer[input_offset + laneID];
   int32_t laneResult = laneID;
 
-  uint numBatches = (reductionSize + warpSize - 1) / warpSize;
+  int32_t numBatches = (reductionSize + warpSize - 1) / warpSize;
   for (int i = 1; i < numBatches; ++i) {
-    uint idx = warpSize * i + laneID;
-    half newIn =
+    int32_t idx = warpSize * i + laneID;
+    _Float16 newIn =
         idx >= reductionSize ? NEG_F16_MAX : inputBuffer[input_offset + idx];
     if (newIn == laneMax)
       continue;
-    laneMax = __ocml_fmax_f16(newIn, laneMax);
+    laneMax = __builtin_fmaxf16(newIn, laneMax);
     laneResult = newIn == laneMax ? idx : laneResult;
   }
-
   // Final reduction with one subgroup
-  half wgMax = __ockl_wfred_max_f16(laneMax);
+  _Float16 wgMax = __ockl_wfred_max_f16(laneMax);
   // Check if there are multiple max value holders.
   uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
   // if there is only one max value holder, write and exit.
-  if (__popcll(laneHasMaxValmask) == 1) {
+  if (__builtin_popcountll(laneHasMaxValmask) == 1) {
     if (wgMax == laneMax)
       outputBuffer[output_offset] = laneResult;
     return;
   }
+
   // if there are multiple max value holder, find smallest index (argmax
   // semantics).
   int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
@@ -156,42 +149,43 @@ extern "C" __device__ void __iree_uk_rocm_argmax_F16I32(half *inputBuffer,
     outputBuffer[output_offset] = laneResult;
 }
 
-extern "C" __device__ void __iree_uk_rocm_argmax_F16I64(half *inputBuffer,
-                                                        size_t input_offset,
-                                                        int64_t *outputBuffer,
-                                                        size_t output_offset,
-                                                        size_t reductionSize) {
-  half NEG_F16_MAX = __float2half(-65504.0f);
-  uint laneID = __builtin_amdgcn_workitem_id_x();
+void __iree_uk_rocm_argmax_F16I64(const _Float16 *inputBuffer,
+                                  int64_t input_offset, int64_t *outputBuffer,
+                                  int64_t output_offset,
+                                  int64_t reductionSize) {
+  const int warpSize = __builtin_amdgcn_wavefrontsize();
+  _Float16 NEG_F16_MAX = (_Float16)(-65504.0f);
+  int32_t laneID = __builtin_amdgcn_workitem_id_x();
   // Set identity value to handle problem non divisible by subgroupSize.
-  half laneMax = laneID >= reductionSize ? NEG_F16_MAX
-                                         : inputBuffer[input_offset + laneID];
+  _Float16 laneMax = laneID >= reductionSize
+                         ? NEG_F16_MAX
+                         : inputBuffer[input_offset + laneID];
   int64_t laneResult = laneID;
 
-  uint numBatches = (reductionSize + warpSize - 1) / warpSize;
+  int32_t numBatches = (reductionSize + warpSize - 1) / warpSize;
   for (int i = 1; i < numBatches; ++i) {
-    uint idx = warpSize * i + laneID;
-    half newIn =
+    int32_t idx = warpSize * i + laneID;
+    _Float16 newIn =
         idx >= reductionSize ? NEG_F16_MAX : inputBuffer[input_offset + idx];
     if (newIn == laneMax)
       continue;
-    laneMax = __ocml_fmax_f16(newIn, laneMax);
+    laneMax = __builtin_fmaxf16(newIn, laneMax);
     laneResult = newIn == laneMax ? idx : laneResult;
   }
 
   // Final reduction with one subgroup
-  half wgMax = __ockl_wfred_max_f16(laneMax);
+  _Float16 wgMax = __ockl_wfred_max_f16(laneMax);
   // Check if there are multiple max value holders.
   uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
   // if there is only one max value holder, write and exit.
-  if (__popcll(laneHasMaxValmask) == 1) {
+  if (__builtin_popcountll(laneHasMaxValmask) == 1) {
     if (wgMax == laneMax)
       outputBuffer[output_offset] = laneResult;
     return;
   }
   // if there are multiple max value holder, find smallest index (argmax
   // semantics).
-  int64_t indexVal = wgMax == laneMax ? laneResult : __INT64_MAX__;
+  int64_t indexVal = wgMax == laneMax ? laneResult : INT64_MAX;
   laneResult = __ockl_wfred_min_i64(indexVal);
   if (laneID == 0)
     outputBuffer[output_offset] = laneResult;