release/20.x: [Clang] Fix test after new argument was added #125912
Conversation
@jhuber6 What do you think about merging this PR to the release branch?
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-clang

Author: None (llvmbot)

Changes

Requested by: @jhuber6

Full diff: https://github.com/llvm/llvm-project/pull/125912.diff

7 Files Affected:
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 038605605462f8..9dad99ffe9439a 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
// Shuffles the the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
- return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
+ uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+ return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
}
// Shuffles the the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+ uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
- return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
- ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
+ return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
+ << 32ull) |
+ ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}
// Returns true if the flat pointer points to AMDGPU 'shared' memory.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4c463c333308fc..11c87e85cd4975 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ float
-__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
+ uint32_t __width) {
return __builtin_bit_cast(
float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
- __builtin_bit_cast(uint32_t, __x)));
+ __builtin_bit_cast(uint32_t, __x), __width));
}
// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ double
-__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
+ uint32_t __width) {
return __builtin_bit_cast(
- double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
- __builtin_bit_cast(uint64_t, __x)));
+ double,
+ __gpu_shuffle_idx_u64(__lane_mask, __idx,
+ __builtin_bit_cast(uint64_t, __x), __width));
}
// Gets the sum of all lanes inside the warp or wavefront.
@@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
uint64_t __lane_mask, __type __x) { \
for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
uint32_t __index = __step + __gpu_lane_id(); \
- __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \
+ __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+ __gpu_num_lanes()); \
} \
return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
}
@@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
uint32_t __index = __gpu_lane_id() - __step; \
__bitmask_type bitmask = __gpu_lane_id() >= __step; \
__x += __builtin_bit_cast( \
- __type, \
- -bitmask & __builtin_bit_cast(__bitmask_type, \
- __gpu_shuffle_idx_##__suffix( \
- __lane_mask, __index, __x))); \
+ __type, -bitmask & __builtin_bit_cast(__bitmask_type, \
+ __gpu_shuffle_idx_##__suffix( \
+ __lane_mask, __index, __x, \
+ __gpu_num_lanes()))); \
} \
return __x; \
}
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index fb2864eab6a09d..40fa2edebe975c 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
// Shuffles the the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+ uint32_t __width) {
uint32_t __mask = (uint32_t)__lane_mask;
- return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+ return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+ ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
}
// Shuffles the the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+ uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
uint32_t __mask = (uint32_t)__lane_mask;
- return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
- __gpu_num_lanes() - 1u)
+ return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
<< 32ull) |
- ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
- __gpu_num_lanes() - 1u));
+ ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}
// Returns true if the flat pointer points to CUDA 'shared' memory.
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 281339716c3edf..89efe12ee8def8 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -38,7 +38,7 @@
// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
// AMDGPU-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
// AMDGPU-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
@@ -70,7 +70,7 @@
// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
// NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
// NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
@@ -90,6 +90,68 @@ __gpu_kernel void foo() {
__gpu_num_threads_z();
__gpu_num_threads(0);
__gpu_thread_id_x();
+// AMDGPU-LABEL: define internal i32 @__gpu_thread_id(
+// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// AMDGPU-NEXT: [[ENTRY:.*:]]
+// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGPU-NEXT: [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr
+// AMDGPU-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// AMDGPU-NEXT: i32 0, label %[[SW_BB:.*]]
+// AMDGPU-NEXT: i32 1, label %[[SW_BB1:.*]]
+// AMDGPU-NEXT: i32 2, label %[[SW_BB3:.*]]
+// AMDGPU-NEXT: ]
+// AMDGPU: [[SW_BB]]:
+// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
+// AMDGPU-NEXT: store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[RETURN:.*]]
+// AMDGPU: [[SW_BB1]]:
+// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
+// AMDGPU-NEXT: store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[RETURN]]
+// AMDGPU: [[SW_BB3]]:
+// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
+// AMDGPU-NEXT: store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT: br label %[[RETURN]]
+// AMDGPU: [[SW_DEFAULT]]:
+// AMDGPU-NEXT: unreachable
+// AMDGPU: [[RETURN]]:
+// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT: ret i32 [[TMP1]]
+//
+// NVPTX-LABEL: define internal i32 @__gpu_thread_id(
+// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// NVPTX-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// NVPTX-NEXT: i32 0, label %[[SW_BB:.*]]
+// NVPTX-NEXT: i32 1, label %[[SW_BB1:.*]]
+// NVPTX-NEXT: i32 2, label %[[SW_BB3:.*]]
+// NVPTX-NEXT: ]
+// NVPTX: [[SW_BB]]:
+// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
+// NVPTX-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT: br label %[[RETURN:.*]]
+// NVPTX: [[SW_BB1]]:
+// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
+// NVPTX-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT: br label %[[RETURN]]
+// NVPTX: [[SW_BB3]]:
+// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
+// NVPTX-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT: br label %[[RETURN]]
+// NVPTX: [[SW_DEFAULT]]:
+// NVPTX-NEXT: unreachable
+// NVPTX: [[RETURN]]:
+// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// NVPTX-NEXT: ret i32 [[TMP1]]
+//
__gpu_thread_id_y();
__gpu_thread_id_z();
__gpu_thread_id(0);
@@ -100,7 +162,7 @@ __gpu_kernel void foo() {
__gpu_ballot(-1, 1);
__gpu_sync_threads();
__gpu_sync_lane(-1);
- __gpu_shuffle_idx_u32(-1, -1, -1);
+ __gpu_shuffle_idx_u32(-1, -1, -1, 0);
__gpu_first_lane_id(-1);
__gpu_is_first_in_lane(-1);
__gpu_exit();
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index e138c84c0cb22d..323c003f1ff074 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
-LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
- return __gpu_shuffle_idx_u32(lane_mask, idx, x);
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
+ uint32_t width = __gpu_num_lanes()) {
+ return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}
[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index 7811e0da45ddcf..68bbc3849bc7ec 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -9,3 +9,12 @@ add_integration_test(
LOADER_ARGS
--threads 64
)
+
+add_integration_test(
+ shuffle_test
+ SUITE libc-support-gpu-tests
+ SRCS
+ shuffle.cpp
+ LOADER_ARGS
+ --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/shuffle.cpp b/libc/test/integration/src/__support/GPU/shuffle.cpp
new file mode 100644
index 00000000000000..c346a2eb3f0c29
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/shuffle.cpp
@@ -0,0 +1,33 @@
+//===-- Test for the shuffle operations on the GPU ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Test to make sure the shuffle instruction works by doing a simple broadcast.
+// Each iteration reduces the width, so it will broadcast to a subset we check.
+static void test_shuffle() {
+ uint64_t mask = gpu::get_lane_mask();
+ EXPECT_EQ(cpp::popcount(mask), gpu::get_lane_size());
+
+ uint32_t x = gpu::get_lane_id();
+ for (uint32_t width = gpu::get_lane_size(); width > 0; width /= 2)
+ EXPECT_EQ(gpu::shuffle(mask, 0, x, width), (x / width) * width);
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+ if (gpu::get_thread_id() >= gpu::get_lane_size())
+ return 0;
+
+ test_shuffle();
+
+ return 0;
+}
@shiltian What do you think about backporting this?
LGTM |
@jhuber6 Does this need an extra patch to fix the build failures?
It should just be the two commits; the PR unfortunately took the name of the latest one. I would've squashed them if I knew how.
@jhuber6 Can you take a look at these test failures?
I don't understand the logs; I looked through them but didn't see which failure was listed. It just said that the build failed without listing any failing tests. Seems to be a build failure relating to
Looks green now. |
Summary: The CUDA implementation has long supported the `width` argument on its shuffle instructions, which makes it more difficult to replace those uses with this helper. This patch correctly implements that argument for AMDGPU and NVPTX so the helper is equivalent to `__shfl_sync` in CUDA. This will ease porting. Fortunately, these computations get optimized out correctly when known widths are passed in.

(cherry picked from commit 2d8106c)
(cherry picked from commit 718cdeb)
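To make the new `width` semantics concrete: the warp is treated as `__gpu_num_lanes() / __width` independent segments, and `__idx` selects a lane within the caller's segment, matching CUDA's `__shfl_sync(mask, var, srcLane, width)`. The AMDGPU path in the diff adds the segment base to the permute index, while the NVPTX path packs `(__gpu_num_lanes() - __width) << 8` into the `shfl.sync` control word. Below is a minimal host-side sketch of the lane arithmetic, illustrative only and not code from the patch (the array, `shuffle_idx`, and the 32-lane warp size are assumptions for demonstration):

#include <stdint.h>
#include <stdio.h>

#define NUM_LANES 32u

/* Source-lane computation mirroring the AMDGPU path above: 'idx' indexes
   within a segment of 'width' lanes, and the caller's segment base is
   added back in. 'width' must be a power of two. */
static uint32_t shuffle_idx(const uint32_t lanes[], uint32_t lane_id,
                            uint32_t idx, uint32_t width) {
  uint32_t src = idx + (lane_id & ~(width - 1u));
  return lanes[src];
}

int main(void) {
  uint32_t lanes[NUM_LANES];
  for (uint32_t i = 0; i < NUM_LANES; ++i)
    lanes[i] = i; /* each lane holds its own id, as in the libc test */

  /* Broadcast lane 0 of every 8-lane segment: each lane should read
     (lane_id / 8) * 8, the value shuffle.cpp's EXPECT_EQ checks. */
  for (uint32_t lane = 0; lane < NUM_LANES; ++lane)
    printf("lane %2u reads %2u\n", (unsigned)lane,
           (unsigned)shuffle_idx(lanes, lane, 0u, 8u));
  return 0;
}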
@jhuber6 (or anyone else): If you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR.
Backport 2d8106c 718cdeb
Requested by: @jhuber6