Skip to content

Commit 265570a

Browse files
authored
Split and rename AMDGPU ukernels (iree-org#19273)
1. Change ukernels prefix from `__iree_uk_rocm` to `iree_uk_amdgpu`. 2. Change ukernels to lowercase. 3. Split ukernels into separate .c files, one .c file <-> one ukernel function. --------- Signed-off-by: Benoit Jacob <[email protected]>
1 parent 5de0f06 commit 265570a

File tree

10 files changed

+238
-221
lines changed

10 files changed

+238
-221
lines changed

compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,10 @@ foreach(_amd_chip ${_ukernel_supported_chips})
118118
ROCM_ARCH
119119
${_amd_chip}
120120
SRCS
121-
"argmax_ukernel.c"
121+
"iree_uk_amdgpu_argmax_f16i32.c"
122+
"iree_uk_amdgpu_argmax_f16i64.c"
123+
"iree_uk_amdgpu_argmax_f32i32.c"
124+
"iree_uk_amdgpu_argmax_f32i64.c"
122125
)
123126
endforeach()
124127

@@ -145,6 +148,10 @@ endforeach()
145148
# Generate a custom target with all file level dependencies and commands to
146149
# copy to our build tree locations.
147150
# Our GenDeviceLibs target depends on all of the defined device lib targets.
151+
message(STATUS "_all_ukernel_bc_files=${_all_ukernel_bc_files}")
152+
message(STATUS "_amd_ukernel_targets=${_amd_ukernel_targets}")
153+
message(STATUS "_all_ukernel_bc_copy_commands=${_all_ukernel_bc_copy_commands}")
154+
148155
add_custom_command(
149156
OUTPUT ${_all_ukernel_bc_files}
150157
DEPENDS ${_amd_ukernel_targets}

compiler/plugins/target/ROCM/builtins/ukernel/argmax_ukernel.c

Lines changed: 0 additions & 192 deletions
This file was deleted.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Copyright 2024 The IREE Authors
2+
//
3+
// Licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
8+
9+
void iree_uk_amdgpu_argmax_f16i32(const _Float16 *inputBuffer,
10+
int64_t input_offset, int32_t *outputBuffer,
11+
int64_t output_offset,
12+
int64_t reductionSize) {
13+
const int warpSize = __builtin_amdgcn_wavefrontsize();
14+
_Float16 NEG_F16_MAX = (_Float16)(-65504.0f);
15+
int32_t laneID = __builtin_amdgcn_workitem_id_x();
16+
// Set identity value to handle problem non divisible by subgroupSize.
17+
_Float16 laneMax = laneID >= reductionSize
18+
? NEG_F16_MAX
19+
: inputBuffer[input_offset + laneID];
20+
int32_t laneResult = laneID;
21+
22+
int32_t numBatches = (reductionSize + warpSize - 1) / warpSize;
23+
for (int i = 1; i < numBatches; ++i) {
24+
int32_t idx = warpSize * i + laneID;
25+
_Float16 newIn =
26+
idx >= reductionSize ? NEG_F16_MAX : inputBuffer[input_offset + idx];
27+
if (newIn == laneMax)
28+
continue;
29+
laneMax = __builtin_fmaxf16(newIn, laneMax);
30+
laneResult = newIn == laneMax ? idx : laneResult;
31+
}
32+
// Final reduction with one subgroup
33+
_Float16 wgMax = __ockl_wfred_max_f16(laneMax);
34+
// Check if there are multiple max value holders.
35+
uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
36+
// if there is only one max value holder, write and exit.
37+
if (__builtin_popcountll(laneHasMaxValmask) == 1) {
38+
if (wgMax == laneMax)
39+
outputBuffer[output_offset] = laneResult;
40+
return;
41+
}
42+
43+
// if there are multiple max value holder, find smallest index (argmax
44+
// semantics).
45+
int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
46+
laneResult = __ockl_wfred_min_i32(indexVal);
47+
if (laneID == 0)
48+
outputBuffer[output_offset] = laneResult;
49+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Copyright 2023 The IREE Authors
2+
//
3+
// Licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
8+
9+
void iree_uk_amdgpu_argmax_f16i64(const _Float16 *inputBuffer,
10+
int64_t input_offset, int64_t *outputBuffer,
11+
int64_t output_offset,
12+
int64_t reductionSize) {
13+
const int warpSize = __builtin_amdgcn_wavefrontsize();
14+
_Float16 NEG_F16_MAX = (_Float16)(-65504.0f);
15+
int32_t laneID = __builtin_amdgcn_workitem_id_x();
16+
// Set identity value to handle problem non divisible by subgroupSize.
17+
_Float16 laneMax = laneID >= reductionSize
18+
? NEG_F16_MAX
19+
: inputBuffer[input_offset + laneID];
20+
int64_t laneResult = laneID;
21+
22+
int32_t numBatches = (reductionSize + warpSize - 1) / warpSize;
23+
for (int i = 1; i < numBatches; ++i) {
24+
int32_t idx = warpSize * i + laneID;
25+
_Float16 newIn =
26+
idx >= reductionSize ? NEG_F16_MAX : inputBuffer[input_offset + idx];
27+
if (newIn == laneMax)
28+
continue;
29+
laneMax = __builtin_fmaxf16(newIn, laneMax);
30+
laneResult = newIn == laneMax ? idx : laneResult;
31+
}
32+
33+
// Final reduction with one subgroup
34+
_Float16 wgMax = __ockl_wfred_max_f16(laneMax);
35+
// Check if there are multiple max value holders.
36+
uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
37+
// if there is only one max value holder, write and exit.
38+
if (__builtin_popcountll(laneHasMaxValmask) == 1) {
39+
if (wgMax == laneMax)
40+
outputBuffer[output_offset] = laneResult;
41+
return;
42+
}
43+
// if there are multiple max value holder, find smallest index (argmax
44+
// semantics).
45+
int64_t indexVal = wgMax == laneMax ? laneResult : INT64_MAX;
46+
laneResult = __ockl_wfred_min_i64(indexVal);
47+
if (laneID == 0)
48+
outputBuffer[output_offset] = laneResult;
49+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// Copyright 2023 The IREE Authors
2+
//
3+
// Licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
8+
9+
void iree_uk_amdgpu_argmax_f32i32(const float *inputBuffer,
10+
int64_t input_offset, int32_t *outputBuffer,
11+
int64_t output_offset,
12+
int64_t reductionSize) {
13+
const int warpSize = __builtin_amdgcn_wavefrontsize();
14+
int32_t laneID = __builtin_amdgcn_workitem_id_x();
15+
// Set identity value to handle problem non divisible by subgroupSize.
16+
float laneMax =
17+
laneID >= reductionSize ? -FLT_MAX : inputBuffer[input_offset + laneID];
18+
int32_t laneResult = laneID;
19+
20+
// NOTE: On F32 kernels with clang, reductionSize/blockDim.x has numerical
21+
// inaccuracy.
22+
int32_t numBatches = (reductionSize + warpSize - 1) / warpSize;
23+
for (int i = 1; i < numBatches; ++i) {
24+
int32_t idx = warpSize * i + laneID;
25+
float newIn =
26+
idx >= reductionSize ? -FLT_MAX : inputBuffer[input_offset + idx];
27+
if (newIn == laneMax)
28+
continue;
29+
laneMax = __builtin_fmaxf(newIn, laneMax);
30+
laneResult = newIn == laneMax ? idx : laneResult;
31+
}
32+
33+
// Final reduction with one subgroup
34+
// NOTE: __ockl_wfred_max_f32 has correctness issue on gfx1100 documented on
35+
// https://github.com/iree-org/iree/issues/16112.
36+
float wgMax = laneMax;
37+
for (int i = 1; i < warpSize; i *= 2) {
38+
wgMax = __builtin_fmaxf(__shfl_xor_f(wgMax, i), wgMax);
39+
}
40+
// Check if there are multiple max value holders.
41+
uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
42+
// if there is only one max value holder, write and exit.
43+
if (__builtin_popcountll(laneHasMaxValmask) == 1) {
44+
if (wgMax == laneMax)
45+
outputBuffer[output_offset] = laneResult;
46+
return;
47+
}
48+
// if there are multiple max value holder, find smallest index (argmax
49+
// semantics).
50+
int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
51+
laneResult = __ockl_wfred_min_i32(indexVal);
52+
if (laneID == 0)
53+
outputBuffer[output_offset] = laneResult;
54+
}

0 commit comments

Comments
 (0)