diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h index 038605605462f..9dad99ffe9439 100644 --- a/clang/lib/Headers/amdgpuintrin.h +++ b/clang/lib/Headers/amdgpuintrin.h @@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) { // Shuffles the the lanes inside the wavefront according to the given index. _DEFAULT_FN_ATTRS static __inline__ uint32_t -__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) { - return __builtin_amdgcn_ds_bpermute(__idx << 2, __x); +__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, + uint32_t __width) { + uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1)); + return __builtin_amdgcn_ds_bpermute(__lane << 2, __x); } // Shuffles the the lanes inside the wavefront according to the given index. _DEFAULT_FN_ATTRS static __inline__ uint64_t -__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) { +__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, + uint32_t __width) { uint32_t __hi = (uint32_t)(__x >> 32ull); uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); - return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) | - ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo)); + return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width) + << 32ull) | + ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width)); } // Returns true if the flat pointer points to AMDGPU 'shared' memory. diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h index 4c463c333308f..11c87e85cd497 100644 --- a/clang/lib/Headers/gpuintrin.h +++ b/clang/lib/Headers/gpuintrin.h @@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) { // Shuffles the the lanes according to the given index. _DEFAULT_FN_ATTRS static __inline__ float -__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) { +__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x, + uint32_t __width) { return __builtin_bit_cast( float, __gpu_shuffle_idx_u32(__lane_mask, __idx, - __builtin_bit_cast(uint32_t, __x))); + __builtin_bit_cast(uint32_t, __x), __width)); } // Shuffles the the lanes according to the given index. _DEFAULT_FN_ATTRS static __inline__ double -__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) { +__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x, + uint32_t __width) { return __builtin_bit_cast( - double, __gpu_shuffle_idx_u64(__lane_mask, __idx, - __builtin_bit_cast(uint64_t, __x))); + double, + __gpu_shuffle_idx_u64(__lane_mask, __idx, + __builtin_bit_cast(uint64_t, __x), __width)); } // Gets the sum of all lanes inside the warp or wavefront. @@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) { uint64_t __lane_mask, __type __x) { \ for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \ uint32_t __index = __step + __gpu_lane_id(); \ - __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x); \ + __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \ + __gpu_num_lanes()); \ } \ return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \ } @@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x) uint32_t __index = __gpu_lane_id() - __step; \ __bitmask_type bitmask = __gpu_lane_id() >= __step; \ __x += __builtin_bit_cast( \ - __type, \ - -bitmask & __builtin_bit_cast(__bitmask_type, \ - __gpu_shuffle_idx_##__suffix( \ - __lane_mask, __index, __x))); \ + __type, -bitmask & __builtin_bit_cast(__bitmask_type, \ + __gpu_shuffle_idx_##__suffix( \ + __lane_mask, __index, __x, \ + __gpu_num_lanes()))); \ } \ return __x; \ } diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h index fb2864eab6a09..40fa2edebe975 100644 --- a/clang/lib/Headers/nvptxintrin.h +++ b/clang/lib/Headers/nvptxintrin.h @@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) { // Shuffles the the lanes inside the warp according to the given index. _DEFAULT_FN_ATTRS static __inline__ uint32_t -__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) { +__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x, + uint32_t __width) { uint32_t __mask = (uint32_t)__lane_mask; - return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u); + return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, + ((__gpu_num_lanes() - __width) << 8u) | 0x1f); } // Shuffles the the lanes inside the warp according to the given index. _DEFAULT_FN_ATTRS static __inline__ uint64_t -__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) { +__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, + uint32_t __width) { uint32_t __hi = (uint32_t)(__x >> 32ull); uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); uint32_t __mask = (uint32_t)__lane_mask; - return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx, - __gpu_num_lanes() - 1u) + return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width) << 32ull) | - ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx, - __gpu_num_lanes() - 1u)); + ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width)); } // Returns true if the flat pointer points to CUDA 'shared' memory. diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h index e138c84c0cb22..323c003f1ff07 100644 --- a/libc/src/__support/GPU/utils.h +++ b/libc/src/__support/GPU/utils.h @@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); } LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); } -LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) { - return __gpu_shuffle_idx_u32(lane_mask, idx, x); +LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x, + uint32_t width = __gpu_num_lanes()) { + return __gpu_shuffle_idx_u32(lane_mask, idx, x, width); } [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); } diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt index 7811e0da45ddc..68bbc3849bc7e 100644 --- a/libc/test/integration/src/__support/GPU/CMakeLists.txt +++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt @@ -9,3 +9,12 @@ add_integration_test( LOADER_ARGS --threads 64 ) + +add_integration_test( + shuffle_test + SUITE libc-support-gpu-tests + SRCS + shuffle.cpp + LOADER_ARGS + --threads 64 +) diff --git a/libc/test/integration/src/__support/GPU/shuffle.cpp b/libc/test/integration/src/__support/GPU/shuffle.cpp new file mode 100644 index 0000000000000..c346a2eb3f0c2 --- /dev/null +++ b/libc/test/integration/src/__support/GPU/shuffle.cpp @@ -0,0 +1,33 @@ +//===-- Test for the shuffle operations on the GPU ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/bit.h" +#include "src/__support/GPU/utils.h" +#include "test/IntegrationTest/test.h" + +using namespace LIBC_NAMESPACE; + +// Test to make sure the shuffle instruction works by doing a simple broadcast. +// Each iteration reduces the width, so it will broadcast to a subset we check. +static void test_shuffle() { + uint64_t mask = gpu::get_lane_mask(); + EXPECT_EQ(cpp::popcount(mask), gpu::get_lane_size()); + + uint32_t x = gpu::get_lane_id(); + for (uint32_t width = gpu::get_lane_size(); width > 0; width /= 2) + EXPECT_EQ(gpu::shuffle(mask, 0, x, width), (x / width) * width); +} + +TEST_MAIN(int argc, char **argv, char **envp) { + if (gpu::get_thread_id() >= gpu::get_lane_size()) + return 0; + + test_shuffle(); + + return 0; +}