ziereis
diff --git a/‎compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel‎
Lines changed: 2 additions & 0 deletions b/‎compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt‎
Lines changed: 104 additions & 0 deletions b/‎compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt‎
Lines changed: 104 additions & 0 deletions
diff --git a/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_bf16i32.c‎
Lines changed: 71 additions & 0 deletions b/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_bf16i32.c‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_bf16i64.c‎
Lines changed: 71 additions & 0 deletions b/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_bf16i64.c‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c‎
Lines changed: 13 additions & 6 deletions b/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c‎
Lines changed: 13 additions & 6 deletions
diff --git a/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c‎
Lines changed: 13 additions & 6 deletions b/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c‎
Lines changed: 13 additions & 6 deletions
diff --git a/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c‎
Lines changed: 12 additions & 6 deletions b/‎compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c‎
Lines changed: 12 additions & 6 deletions
@@ -37,6 +37,8 @@ gpu_archs = [
 
 # Element type combinations for the argmax ukernel.
 argmax_types = [
+    "bf16i32",
+    "bf16i64",
     "f16i32",
     "f16i64",
     "f32i32",
 
@@ -14,6 +14,102 @@ if(NOT IREE_TARGET_BACKEND_ROCM)
   return()
 endif()
 
+# Bitcode libraries for the argmax ukernel with bf16 inputs and i32 indices.
+# One library per GPU_ARCH (gfx90a, gfx942, gfx1030, gfx1100); each call lists
+# common.h plus the shared .c source as SRCS and an arch-suffixed .bc as OUT.
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_argmax_bf16i32_gfx90a
+  GPU_ARCH
+    gfx90a
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_argmax_bf16i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_bf16i32.gfx90a.bc"
+)
+
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_argmax_bf16i32_gfx942
+  GPU_ARCH
+    gfx942
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_argmax_bf16i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_bf16i32.gfx942.bc"
+)
+
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_argmax_bf16i32_gfx1030
+  GPU_ARCH
+    gfx1030
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_argmax_bf16i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_bf16i32.gfx1030.bc"
+)
+
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_argmax_bf16i32_gfx1100
+  GPU_ARCH
+    gfx1100
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_argmax_bf16i32.c"
+  OUT
+    "iree_uk_amdgpu_argmax_bf16i32.gfx1100.bc"
+)
+
+# Bitcode libraries for the argmax ukernel with bf16 inputs and i64 indices.
+# Same per-GPU_ARCH layout as the bf16/i32 group above in this hunk.
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_argmax_bf16i64_gfx90a
+  GPU_ARCH
+    gfx90a
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_argmax_bf16i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_bf16i64.gfx90a.bc"
+)
+
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_argmax_bf16i64_gfx942
+  GPU_ARCH
+    gfx942
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_argmax_bf16i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_bf16i64.gfx942.bc"
+)
+
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_argmax_bf16i64_gfx1030
+  GPU_ARCH
+    gfx1030
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_argmax_bf16i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_bf16i64.gfx1030.bc"
+)
+
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_argmax_bf16i64_gfx1100
+  GPU_ARCH
+    gfx1100
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_argmax_bf16i64.c"
+  OUT
+    "iree_uk_amdgpu_argmax_bf16i64.gfx1100.bc"
+)
+
 iree_amdgpu_bitcode_library(
   NAME
     iree_uk_amdgpu_argmax_f16i32_gfx90a
@@ -222,6 +318,14 @@ iree_c_embed_data(
   NAME
     iree_uk_amdgpu_bitcode
   SRCS
+    "iree_uk_amdgpu_argmax_bf16i32.gfx1030.bc"
+    "iree_uk_amdgpu_argmax_bf16i32.gfx1100.bc"
+    "iree_uk_amdgpu_argmax_bf16i32.gfx90a.bc"
+    "iree_uk_amdgpu_argmax_bf16i32.gfx942.bc"
+    "iree_uk_amdgpu_argmax_bf16i64.gfx1030.bc"
+    "iree_uk_amdgpu_argmax_bf16i64.gfx1100.bc"
+    "iree_uk_amdgpu_argmax_bf16i64.gfx90a.bc"
+    "iree_uk_amdgpu_argmax_bf16i64.gfx942.bc"
     "iree_uk_amdgpu_argmax_f16i32.gfx1030.bc"
     "iree_uk_amdgpu_argmax_f16i32.gfx1100.bc"
     "iree_uk_amdgpu_argmax_f16i32.gfx90a.bc"
 
@@ -0,0 +1,71 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
+
+// Argmax over `reductionSize` bf16 values read from
+// `inputBuffer[input_offset + k]`.
+//
+// Each lane walks the input with a stride of the wavefront size and keeps a
+// running (max, index) pair; the per-lane pairs are then combined across the
+// wavefront. The index of the maximum (the smallest such index when the
+// maximum occurs more than once) is stored to
+// `outputBufferIdx[output_idx_offset]`, and, when `writeValue` is true, the
+// maximum itself is stored to `outputBufferVal[output_val_offset]`.
+//
+// NOTE(review): workitem_id_x is used directly as the lane id, so this looks
+// like it expects a workgroup of exactly one wavefront -- confirm with caller.
+[[clang::always_inline]] void iree_uk_amdgpu_argmax_bf16i32(
+    const __bf16 *inputBuffer, int64_t input_offset, __bf16 *outputBufferVal,
+    int64_t output_val_offset, int32_t *outputBufferIdx,
+    int64_t output_idx_offset, int64_t reductionSize, bool writeValue) {
+  // NOTE:
+  // We convert bf16 inputs to f32 before computation because HIP/OCKL and
+  // Clang/LLVM do not currently support native arithmetic or comparisons on
+  // bf16. In practice, these operations are internally performed by first
+  // converting bf16 to float.
+  const int warpSize = __builtin_amdgcn_wavefrontsize();
+  int32_t laneID = __builtin_amdgcn_workitem_id_x();
+  // Identity value for slots past the end of the input (problem size not
+  // divisible by subgroupSize). This has to be -inf, not -FLT_MAX: -inf is a
+  // valid bf16 input, and with a -FLT_MAX identity a padding lane would
+  // compare greater than a real -inf element and win the reduction with an
+  // index >= reductionSize. With -inf, padding can at best tie, and ties are
+  // resolved towards the smallest index, which always belongs to real data.
+  const float negInf = -__builtin_inff();
+  float laneMax = laneID >= reductionSize
+                      ? negInf
+                      : (float)(inputBuffer[input_offset + laneID]);
+  int32_t laneResult = laneID;
+
+  // NOTE: On F32 kernels with clang, reductionSize/blockDim.x has numerical
+  // inaccuracy.
+  int32_t numBatches = (reductionSize + warpSize - 1) / warpSize;
+  for (int i = 1; i < numBatches; ++i) {
+    int32_t idx = warpSize * i + laneID;
+    float newIn = idx >= reductionSize
+                      ? negInf
+                      : (float)(inputBuffer[input_offset + idx]);
+    // On a tie keep the index already recorded: it is the smaller one.
+    if (newIn == laneMax) {
+      continue;
+    }
+    laneMax = __builtin_fmaxf(newIn, laneMax);
+    laneResult = newIn == laneMax ? idx : laneResult;
+  }
+
+  // Final reduction with one subgroup
+  // NOTE: __ockl_wfred_max_f32 has correctness issue on gfx1100 documented
+  // on https://github.com/iree-org/iree/issues/16112.
+  float wgMax = laneMax;
+  for (int i = 1; i < warpSize; i *= 2) {
+    wgMax = __builtin_fmaxf(__shfl_xor_f(wgMax, i), wgMax);
+  }
+  // Check if there are multiple max value holders.
+  uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
+  // If there is only one max value holder, that lane writes and we are done.
+  if (__builtin_popcountll(laneHasMaxValmask) == 1) {
+    if (wgMax == laneMax) {
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = (__bf16)wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
+    }
+  } else {
+    // If there are multiple max value holders, find the smallest index
+    // (argmax semantics). Lanes that do not hold the max contribute
+    // INT32_MAX so they cannot win the min-reduction.
+    int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
+    laneResult = __ockl_wfred_min_i32(indexVal);
+    if (laneID == 0) {
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = (__bf16)wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
+    }
+  }
+  // TODO(bjacob): this fence should be on the caller side. Move to TileAndFuse?
+  __threadfence_block();
+}
@@ -0,0 +1,71 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
+
+// Argmax over `reductionSize` bf16 values read from
+// `inputBuffer[input_offset + k]`, producing a 64-bit index.
+//
+// Each lane walks the input with a stride of the wavefront size and keeps a
+// running (max, index) pair; the per-lane pairs are then combined across the
+// wavefront. The index of the maximum (the smallest such index when the
+// maximum occurs more than once) is stored to
+// `outputBufferIdx[output_idx_offset]`, and, when `writeValue` is true, the
+// maximum itself is stored to `outputBufferVal[output_val_offset]`.
+//
+// NOTE(review): workitem_id_x is used directly as the lane id, so this looks
+// like it expects a workgroup of exactly one wavefront -- confirm with caller.
+[[clang::always_inline]] void iree_uk_amdgpu_argmax_bf16i64(
+    const __bf16 *inputBuffer, int64_t input_offset, __bf16 *outputBufferVal,
+    int64_t output_val_offset, int64_t *outputBufferIdx,
+    int64_t output_idx_offset, int64_t reductionSize, bool writeValue) {
+  // NOTE:
+  // We convert bf16 inputs to f32 before computation because HIP/OCKL and
+  // Clang/LLVM do not currently support native arithmetic or comparisons on
+  // bf16. In practice, these operations are internally performed by first
+  // converting bf16 to float.
+  const int warpSize = __builtin_amdgcn_wavefrontsize();
+  int32_t laneID = __builtin_amdgcn_workitem_id_x();
+  // Identity value for slots past the end of the input (problem size not
+  // divisible by subgroupSize). This has to be -inf, not -FLT_MAX: -inf is a
+  // valid bf16 input, and with a -FLT_MAX identity a padding lane would
+  // compare greater than a real -inf element and win the reduction with an
+  // index >= reductionSize. With -inf, padding can at best tie, and ties are
+  // resolved towards the smallest index, which always belongs to real data.
+  const float negInf = -__builtin_inff();
+  float laneMax = laneID >= reductionSize
+                      ? negInf
+                      : (float)(inputBuffer[input_offset + laneID]);
+  int64_t laneResult = laneID;
+
+  // NOTE: On F32 kernels with clang, reductionSize/blockDim.x has numerical
+  // inaccuracy.
+  // The batch count and element index are kept in 64 bits: reductionSize and
+  // the produced index are int64_t, so 32-bit intermediates would overflow
+  // for reductions larger than INT32_MAX elements.
+  int64_t numBatches = (reductionSize + warpSize - 1) / warpSize;
+  for (int64_t i = 1; i < numBatches; ++i) {
+    int64_t idx = warpSize * i + laneID;
+    float newIn = idx >= reductionSize
+                      ? negInf
+                      : (float)(inputBuffer[input_offset + idx]);
+    // On a tie keep the index already recorded: it is the smaller one.
+    if (newIn == laneMax) {
+      continue;
+    }
+    laneMax = __builtin_fmaxf(newIn, laneMax);
+    laneResult = newIn == laneMax ? idx : laneResult;
+  }
+
+  // Final reduction with one subgroup
+  // NOTE: __ockl_wfred_max_f32 has correctness issue on gfx1100 documented on
+  // https://github.com/iree-org/iree/issues/16112.
+  float wgMax = laneMax;
+  for (int i = 1; i < warpSize; i *= 2) {
+    wgMax = __builtin_fmaxf(__shfl_xor_f(wgMax, i), wgMax);
+  }
+  // Check if there are multiple max value holders.
+  uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
+  // If there is only one max value holder, that lane writes and we are done.
+  if (__builtin_popcountll(laneHasMaxValmask) == 1) {
+    if (wgMax == laneMax) {
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = (__bf16)wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
+    }
+  } else {
+    // If there are multiple max value holders, find the smallest index
+    // (argmax semantics). Lanes that do not hold the max contribute
+    // INT64_MAX so they cannot win the min-reduction.
+    int64_t indexVal = wgMax == laneMax ? laneResult : INT64_MAX;
+    laneResult = __ockl_wfred_min_i64(indexVal);
+    if (laneID == 0) {
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = (__bf16)wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
+    }
+  }
+  // TODO(bjacob): this fence should be on the caller side. Move to TileAndFuse?
+  __threadfence_block();
+}
@@ -6,10 +6,11 @@
 
 #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
 
-[[clang::always_inline]] void
-iree_uk_amdgpu_argmax_f16i32(const _Float16 *inputBuffer, int64_t input_offset,
-                             int32_t *outputBuffer, int64_t output_offset,
-                             int64_t reductionSize) {
+[[clang::always_inline]] void iree_uk_amdgpu_argmax_f16i32(
+    const _Float16 *inputBuffer, int64_t input_offset,
+    _Float16 *outputBufferVal, int64_t output_val_offset,
+    int32_t *outputBufferIdx, int64_t output_idx_offset, int64_t reductionSize,
+    bool writeValue) {
   const int warpSize = __builtin_amdgcn_wavefrontsize();
   _Float16 NEG_F16_MAX = (_Float16)(-65504.0f);
   int32_t laneID = __builtin_amdgcn_workitem_id_x();
@@ -36,15 +37,21 @@ iree_uk_amdgpu_argmax_f16i32(const _Float16 *inputBuffer, int64_t input_offset,
   // if there is only one max value holder, write and exit.
   if (__builtin_popcountll(laneHasMaxValmask) == 1) {
     if (wgMax == laneMax) {
-      outputBuffer[output_offset] = laneResult;
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
     }
   } else {
     // if there are multiple max value holder, find smallest index (argmax
     // semantics).
     int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
     laneResult = __ockl_wfred_min_i32(indexVal);
     if (laneID == 0) {
-      outputBuffer[output_offset] = laneResult;
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
     }
   }
   // TODO(bjacob): this fence should be on the caller side. Move to TileAndFuse?
 
@@ -6,10 +6,11 @@
 
 #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
 
-[[clang::always_inline]] void
-iree_uk_amdgpu_argmax_f16i64(const _Float16 *inputBuffer, int64_t input_offset,
-                             int64_t *outputBuffer, int64_t output_offset,
-                             int64_t reductionSize) {
+[[clang::always_inline]] void iree_uk_amdgpu_argmax_f16i64(
+    const _Float16 *inputBuffer, int64_t input_offset,
+    _Float16 *outputBufferVal, int64_t output_val_offset,
+    int64_t *outputBufferIdx, int64_t output_idx_offset, int64_t reductionSize,
+    bool writeValue) {
   const int warpSize = __builtin_amdgcn_wavefrontsize();
   _Float16 NEG_F16_MAX = (_Float16)(-65504.0f);
   int32_t laneID = __builtin_amdgcn_workitem_id_x();
@@ -37,15 +38,21 @@ iree_uk_amdgpu_argmax_f16i64(const _Float16 *inputBuffer, int64_t input_offset,
   // if there is only one max value holder, write and exit.
   if (__builtin_popcountll(laneHasMaxValmask) == 1) {
     if (wgMax == laneMax) {
-      outputBuffer[output_offset] = laneResult;
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
     }
   } else {
     // if there are multiple max value holder, find smallest index (argmax
     // semantics).
     int64_t indexVal = wgMax == laneMax ? laneResult : INT64_MAX;
     laneResult = __ockl_wfred_min_i64(indexVal);
     if (laneID == 0) {
-      outputBuffer[output_offset] = laneResult;
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
     }
   }
   // TODO(bjacob): this fence should be on the caller side. Move to TileAndFuse?
 
@@ -6,10 +6,10 @@
 
 #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
 
-[[clang::always_inline]] void
-iree_uk_amdgpu_argmax_f32i32(const float *inputBuffer, int64_t input_offset,
-                             int32_t *outputBuffer, int64_t output_offset,
-                             int64_t reductionSize) {
+[[clang::always_inline]] void iree_uk_amdgpu_argmax_f32i32(
+    const float *inputBuffer, int64_t input_offset, float *outputBufferVal,
+    int64_t output_val_offset, int32_t *outputBufferIdx,
+    int64_t output_idx_offset, int64_t reductionSize, bool writeValue) {
   const int warpSize = __builtin_amdgcn_wavefrontsize();
   int32_t laneID = __builtin_amdgcn_workitem_id_x();
   // Set identity value to handle problem non divisible by subgroupSize.
@@ -42,15 +42,21 @@ iree_uk_amdgpu_argmax_f32i32(const float *inputBuffer, int64_t input_offset,
   // if there is only one max value holder, write and exit.
   if (__builtin_popcountll(laneHasMaxValmask) == 1) {
     if (wgMax == laneMax) {
-      outputBuffer[output_offset] = laneResult;
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
     }
   } else {
     // if there are multiple max value holder, find smallest index (argmax
     // semantics).
     int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
     laneResult = __ockl_wfred_min_i32(indexVal);
     if (laneID == 0) {
-      outputBuffer[output_offset] = laneResult;
+      if (writeValue) {
+        outputBufferVal[output_val_offset] = wgMax;
+      }
+      outputBufferIdx[output_idx_offset] = laneResult;
     }
   }
   // TODO(bjacob): this fence should be on the caller side. Move to TileAndFuse?