diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 67242cd4d981b..813fc86a5bb5c 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -273,6 +273,12 @@ set(x86_files
   cpuid.h
   )
 
+set(gpu_files
+  gpuintrin.h
+  nvptxintrin.h
+  amdgpuintrin.h
+  )
+
 set(windows_only_files
   intrin0.h
   intrin.h
@@ -301,6 +307,7 @@ set(files
   ${systemz_files}
   ${ve_files}
   ${x86_files}
+  ${gpu_files}
   ${webassembly_files}
   ${windows_only_files}
   ${utility_files}
@@ -523,6 +530,7 @@ add_header_target("systemz-resource-headers" "${systemz_files};${zos_wrapper_fil
 add_header_target("ve-resource-headers" "${ve_files}")
 add_header_target("webassembly-resource-headers" "${webassembly_files}")
 add_header_target("x86-resource-headers" "${x86_files}")
+add_header_target("gpu-resource-headers" "${gpu_files}")
 
 # Other header groupings
 add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -709,6 +717,12 @@ install(
   EXCLUDE_FROM_ALL
   COMPONENT x86-resource-headers)
 
+install(
+  FILES ${gpu_files}
+  DESTINATION ${header_install_dir}
+  EXCLUDE_FROM_ALL
+  COMPONENT gpu-resource-headers)
+
 if(NOT CLANG_ENABLE_HLSL)
   set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
 endif()
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
new file mode 100644
index 0000000000000..f4f90b394522d
--- /dev/null
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -0,0 +1,190 @@
+//===-- amdgpuintrin.h - AMDGPU intrinsic functions -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __AMDGPUINTRIN_H
+#define __AMDGPUINTRIN_H
+
+#ifndef __AMDGPU__
+#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
+#endif
+
+#include <stdint.h>
+
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
+
+// Type aliases to the address spaces used by the AMDGPU backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
+
+// Returns the number of workgroups in the 'x' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workgroups in the 'y' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workgroups in the 'z' dimension of the grid.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
+  return __builtin_amdgcn_workgroup_id_x();
+}
+
+// Returns the 'y' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
+  return __builtin_amdgcn_workgroup_id_y();
+}
+
+// Returns the 'z' dimension of the current AMD workgroup's id.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
+  return __builtin_amdgcn_workgroup_id_z();
+}
+
+// Returns the number of workitems in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
+  return __builtin_amdgcn_workgroup_size_x();
+}
+
+// Returns the number of workitems in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
+  return __builtin_amdgcn_workgroup_size_y();
+}
+
+// Returns the number of workitems in the 'z' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
+  return __builtin_amdgcn_workgroup_size_z();
+}
+
+// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
+  return __builtin_amdgcn_workitem_id_x();
+}
+
+// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
+  return __builtin_amdgcn_workitem_id_y();
+}
+
+// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
+  return __builtin_amdgcn_workitem_id_z();
+}
+
+// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+// and compilation options.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
+  return __builtin_amdgcn_wavefrontsize();
+}
+
+// Returns the id of the thread inside of an AMD wavefront executing together.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+// Returns the bit-mask of active threads in the current wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+  return __builtin_amdgcn_read_exec();
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+  return __builtin_amdgcn_readfirstlane(__x);
+}
+
+// Copies the value from the first active thread in the wavefront to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
+         ((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
+}
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  // The & with __lane_mask gives the NVPTX semantics when __lane_mask is a
+  // subset of the active threads.
+  return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  __builtin_amdgcn_s_barrier();
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+}
+
+// Waits for all threads in the wavefront to converge; this is a no-op on
+// AMDGPU.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+  __builtin_amdgcn_wave_barrier();
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
+}
+
+// Shuffles the lanes inside the wavefront according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
+         ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
}
+
+// Returns true if the flat pointer points to CUDA 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return __builtin_amdgcn_is_shared(
+      (void __attribute__((address_space(0))) *)((void __gpu_generic *)ptr));
+}
+
+// Returns true if the flat pointer points to CUDA 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return __builtin_amdgcn_is_private(
+      (void __attribute__((address_space(0))) *)((void __gpu_generic *)ptr));
+}
+
+// Terminates execution of the associated wavefront.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  __builtin_amdgcn_endpgm();
+}
+
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  __builtin_amdgcn_s_sleep(2);
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+
+#endif // __AMDGPUINTRIN_H
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
new file mode 100644
index 0000000000000..4c463c333308f
--- /dev/null
+++ b/clang/lib/Headers/gpuintrin.h
@@ -0,0 +1,196 @@
+//===-- gpuintrin.h - Generic GPU intrinsic functions ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Provides wrappers around the clang builtins for accessing GPU hardware
+// features. The interface is intended to be portable between architectures,
+// but some targets may provide different implementations. This header can be
+// included for all the common GPU programming languages, namely OpenMP, HIP,
+// CUDA, and OpenCL.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __GPUINTRIN_H
+#define __GPUINTRIN_H
+
+#if !defined(_DEFAULT_FN_ATTRS)
+#if defined(__HIP__) || defined(__CUDA__)
+#define _DEFAULT_FN_ATTRS __attribute__((device))
+#else
+#define _DEFAULT_FN_ATTRS
+#endif
+#endif
+
+#if defined(__NVPTX__)
+#include <nvptxintrin.h>
+#elif defined(__AMDGPU__)
+#include <amdgpuintrin.h>
+#elif !defined(_OPENMP)
+#error "This header is only meant to be used on GPU architectures."
+#endif
+
+#include <stdint.h>
+
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+#define __GPU_X_DIM 0
+#define __GPU_Y_DIM 1
+#define __GPU_Z_DIM 2
+
+// Returns the number of blocks in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_num_blocks_x();
+  case 1:
+    return __gpu_num_blocks_y();
+  case 2:
+    return __gpu_num_blocks_z();
+  default:
+    __builtin_unreachable();
+  }
+}
+
+// Returns the id of the block in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_block_id_x();
+  case 1:
+    return __gpu_block_id_y();
+  case 2:
+    return __gpu_block_id_z();
+  default:
+    __builtin_unreachable();
+  }
+}
+
+// Returns the number of threads in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_num_threads_x();
+  case 1:
+    return __gpu_num_threads_y();
+  case 2:
+    return __gpu_num_threads_z();
+  default:
+    __builtin_unreachable();
+  }
+}
+
+// Returns the thread id in the requested dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id(int __dim) {
+  switch (__dim) {
+  case 0:
+    return __gpu_thread_id_x();
+  case 1:
+    return __gpu_thread_id_y();
+  case 2:
+    return __gpu_thread_id_z();
+  default:
+    __builtin_unreachable();
+  }
+}
+
+// Gets the id of the first active thread inside the lane.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_first_lane_id(uint64_t __lane_mask) {
+  return __builtin_ffsll(__lane_mask) - 1;
+}
+
+// Conditional that is only true for a single thread in a lane.
+_DEFAULT_FN_ATTRS static __inline__ bool
+__gpu_is_first_in_lane(uint64_t __lane_mask) {
+  return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
+}
+
+// Gets the first floating point value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {
+  return __builtin_bit_cast(
+      float, __gpu_read_first_lane_u32(__lane_mask,
+                                       __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Gets the first floating point value from the active lanes.
+_DEFAULT_FN_ATTRS static __inline__ double
+__gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
+  return __builtin_bit_cast(
+      double, __gpu_read_first_lane_u64(__lane_mask,
+                                        __builtin_bit_cast(uint64_t, __x)));
+}
+
+// Shuffles the lanes according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+  return __builtin_bit_cast(
+      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
+                                   __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Shuffles the lanes according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ double
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+  return __builtin_bit_cast(
+      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                                    __builtin_bit_cast(uint64_t, __x)));
+}
+
+// Gets the sum of all lanes inside the warp or wavefront.
+#define __DO_LANE_SUM(__type, __suffix)                                        \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
+      uint64_t __lane_mask, __type __x) {                                      \
+    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
+      uint32_t __index = __step + __gpu_lane_id();                             \
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);          \
+    }                                                                          \
+    return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
+  }
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
+
+// Gets the accumulator scan of the threads in the warp or wavefront.
+#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_scan_##__suffix(       \
+      uint64_t __lane_mask, __type __x) {                                      \
+    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
+      uint32_t __index = __gpu_lane_id() - __step;                             \
+      __bitmask_type __bitmask = __gpu_lane_id() >= __step;                    \
+      __x += __builtin_bit_cast(                                               \
+          __type,                                                              \
+          -__bitmask & __builtin_bit_cast(__bitmask_type,                      \
+                                          __gpu_shuffle_idx_##__suffix(        \
+                                              __lane_mask, __index, __x)));    \
+    }                                                                          \
+    return __x;                                                                \
+  }
+__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
+__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
+__DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
+__DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
+#undef __DO_LANE_SCAN
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+
+#undef _DEFAULT_FN_ATTRS
+
+#endif // __GPUINTRIN_H
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
new file mode 100644
index 0000000000000..8b68b807cac4f
--- /dev/null
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -0,0 +1,201 @@
+//===-- nvptxintrin.h - NVPTX intrinsic functions -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __NVPTXINTRIN_H
+#define __NVPTXINTRIN_H
+
+#ifndef __NVPTX__
+#error "This file is intended for NVPTX targets or offloading to NVPTX"
+#endif
+
+#include <stdint.h>
+
+#if !defined(__cplusplus)
+_Pragma("push_macro(\"bool\")");
+#define bool _Bool
+#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
+
+// Type aliases to the address spaces used by the NVPTX backend.
+#define __gpu_private __attribute__((opencl_private))
+#define __gpu_constant __attribute__((opencl_constant))
+#define __gpu_local __attribute__((opencl_local))
+#define __gpu_global __attribute__((opencl_global))
+#define __gpu_generic __attribute__((opencl_generic))
+
+// Attribute to declare a function as a kernel.
+#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
+
+// Returns the number of CUDA blocks in the 'x' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
+  return __nvvm_read_ptx_sreg_nctaid_x();
+}
+
+// Returns the number of CUDA blocks in the 'y' dimension.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) { + return __nvvm_read_ptx_sreg_nctaid_y(); +} + +// Returns the number of CUDA blocks in the 'z' dimension. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) { + return __nvvm_read_ptx_sreg_nctaid_z(); +} + +// Returns the 'x' dimension of the current CUDA block's id. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) { + return __nvvm_read_ptx_sreg_ctaid_x(); +} + +// Returns the 'y' dimension of the current CUDA block's id. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) { + return __nvvm_read_ptx_sreg_ctaid_y(); +} + +// Returns the 'z' dimension of the current CUDA block's id. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) { + return __nvvm_read_ptx_sreg_ctaid_z(); +} + +// Returns the number of CUDA threads in the 'x' dimension. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) { + return __nvvm_read_ptx_sreg_ntid_x(); +} + +// Returns the number of CUDA threads in the 'y' dimension. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) { + return __nvvm_read_ptx_sreg_ntid_y(); +} + +// Returns the number of CUDA threads in the 'z' dimension. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) { + return __nvvm_read_ptx_sreg_ntid_z(); +} + +// Returns the 'x' dimension id of the thread in the current CUDA block. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) { + return __nvvm_read_ptx_sreg_tid_x(); +} + +// Returns the 'y' dimension id of the thread in the current CUDA block. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) { + return __nvvm_read_ptx_sreg_tid_y(); +} + +// Returns the 'z' dimension id of the thread in the current CUDA block. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) { + return __nvvm_read_ptx_sreg_tid_z(); +} + +// Returns the size of a CUDA warp, always 32 on NVIDIA hardware. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) { + return __nvvm_read_ptx_sreg_warpsize(); +} + +// Returns the id of the thread inside of a CUDA warp executing together. +_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) { + return __nvvm_read_ptx_sreg_laneid(); +} + +// Returns the bit-mask of active threads in the current warp. +_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) { + return __nvvm_activemask(); +} + +// Copies the value from the first active thread in the warp to the rest. +_DEFAULT_FN_ATTRS static __inline__ uint32_t +__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) { + uint32_t __mask = (uint32_t)__lane_mask; + uint32_t __id = __builtin_ffs(__mask) - 1; + return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1); +} + +// Copies the value from the first active thread in the warp to the rest. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) { + uint32_t __hi = (uint32_t)(__x >> 32ull); + uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF); + uint32_t __mask = (uint32_t)__lane_mask; + uint32_t __id = __builtin_ffs(__mask) - 1; + return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id, + __gpu_num_lanes() - 1) + << 32ull) | + ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id, + __gpu_num_lanes() - 1)); +} + +// Returns a bitmask of threads in the current lane for which \p x is true. 
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x) {
+  uint32_t __mask = (uint32_t)__lane_mask;
+  return __nvvm_vote_ballot_sync(__mask, __x);
+}
+
+// Waits for all the threads in the block to converge and issues a fence.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
+  __syncthreads();
+}
+
+// Waits for all threads in the warp to reconverge for independent scheduling.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
+  __nvvm_bar_warp_sync((uint32_t)__lane_mask);
+}
+
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint32_t __bitmask = (__mask >> __idx) & 1u;
+  return -__bitmask &
+         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+}
+
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  uint32_t __mask = (uint32_t)__lane_mask;
+  uint64_t __bitmask = (__mask >> __idx) & 1u;
+  return -__bitmask & (((uint64_t)__nvvm_shfl_sync_idx_i32(
+                            __mask, __hi, __idx, __gpu_num_lanes() - 1u)
+                        << 32ull) |
+                       ((uint64_t)__nvvm_shfl_sync_idx_i32(
+                           __mask, __lo, __idx, __gpu_num_lanes() - 1u)));
+}
+
+// Returns true if the flat pointer points to CUDA 'shared' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
+  return __nvvm_isspacep_shared(ptr);
+}
+
+// Returns true if the flat pointer points to CUDA 'local' memory.
+_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
+  return __nvvm_isspacep_local(ptr);
+}
+
+// Terminates execution of the calling thread.
+_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
+  __nvvm_exit();
+}
+
+// Suspend the thread briefly to assist the scheduler during busy loops.
+_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+    asm("nanosleep.u32 64;" ::: "memory");
+}
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+#if !defined(__cplusplus)
+_Pragma("pop_macro(\"bool\")");
+#endif
+
+#endif // __NVPTXINTRIN_H
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
new file mode 100644
index 0000000000000..2e45f73692f53
--- /dev/null
+++ b/clang/test/Headers/gpuintrin.c
@@ -0,0 +1,107 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefix=AMDGPU
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -target-feature +ptx62 \
+// RUN:   -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefix=NVPTX
+
+#include <gpuintrin.h>
+
+// AMDGPU-LABEL: define protected amdgpu_kernel void @foo(
+// AMDGPU-SAME: ) #[[ATTR0:[0-9]+]] {
+// AMDGPU-NEXT: [[ENTRY:.*:]]
+// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR7:[0-9]+]]
+// AMDGPU-NEXT: [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
+// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
+// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
+// AMDGPU-NEXT: unreachable
+//
+// NVPTX-LABEL: define protected void @foo(
+// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
+// NVPTX-NEXT: [[ENTRY:.*:]]
+// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6:[0-9]+]]
+// NVPTX-NEXT: [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
+// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
+// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
+// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
+// NVPTX-NEXT: unreachable
+//
+__gpu_kernel void foo() {
+  __gpu_num_blocks_x();
+  __gpu_num_blocks_y();
+  __gpu_num_blocks_z();
+  __gpu_num_blocks(0);
+  __gpu_block_id_x();
+  __gpu_block_id_y();
+  __gpu_block_id_z();
+  __gpu_block_id(0);
+  __gpu_num_threads_x();
+  __gpu_num_threads_y();
+  __gpu_num_threads_z();
+  __gpu_num_threads(0);
+  __gpu_thread_id_x();
+  __gpu_thread_id_y();
+  __gpu_thread_id_z();
+  __gpu_thread_id(0);
+  __gpu_num_lanes();
+  __gpu_lane_id();
+  __gpu_lane_mask();
+  __gpu_read_first_lane_u32(-1, -1);
+  __gpu_ballot(-1, 1);
+  __gpu_sync_threads();
+  __gpu_sync_lane(-1);
+  __gpu_shuffle_idx_u32(-1, -1, -1);
+  __gpu_first_lane_id(-1);
+  __gpu_is_first_in_lane(-1);
+  __gpu_exit();
+}
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
new file mode 100644
index 0000000000000..b2dfc9d40827d
--- /dev/null
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -0,0 +1,76 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/cuda_wrappers \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -fcuda-is-device -triple nvptx64 -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefix=CUDA
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/cuda_wrappers \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -fcuda-is-device -triple amdgcn -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefix=HIP
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -cl-std=CL3.0 -triple amdgcn -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefix=OPENCL
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -internal-isystem %S/../../lib/Headers/ -cl-std=CL3.0 \
+// RUN:   -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
+// RUN:   -fopenmp-is-target-device -triple amdgcn -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefix=OPENMP
+//
+// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
+// RUN:   -std=c89 -internal-isystem %S/../../lib/Headers/ \
+// RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefix=C89

+#define _DEFAULT_FN_ATTRS __attribute__((always_inline))
+#include <gpuintrin.h>
+
+#ifdef __device__
+__device__ int foo() { return __gpu_thread_id_x(); }
+#else
+// CUDA-LABEL: define dso_local i32 @foo(
+// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
+// CUDA-NEXT: [[ENTRY:.*:]]
+// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CUDA-NEXT: ret i32 [[TMP0]]
+//
+// HIP-LABEL: define dso_local i32 @foo(
+// HIP-SAME: ) #[[ATTR0:[0-9]+]] {
+// HIP-NEXT: [[ENTRY:.*:]]
+// HIP-NEXT: [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5)
+// HIP-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// HIP-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// HIP-NEXT: [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
+// HIP-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// HIP-NEXT: ret i32 [[TMP0]]
+//
+// OPENCL-LABEL: define dso_local i32 @foo(
+// OPENCL-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENCL-NEXT: [[ENTRY:.*:]]
+// OPENCL-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// OPENCL-NEXT: ret i32 [[TMP0]]
+//
+// OPENMP-LABEL: define hidden i32 @foo(
+// OPENMP-SAME: ) #[[ATTR0:[0-9]+]] {
+// OPENMP-NEXT: [[ENTRY:.*:]]
+// OPENMP-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// OPENMP-NEXT: ret i32 [[TMP0]]
+//
+// C89-LABEL: define dso_local i32 @foo(
+// C89-SAME: ) #[[ATTR2:[0-9]+]] {
+// C89-NEXT: [[ENTRY:.*:]]
+// C89-NEXT: [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5)
+// C89-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// C89-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// C89-NEXT: [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
+// C89-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+// C89-NEXT: ret i32 [[TMP0]]
+//
+int foo() { return __gpu_thread_id_x(); }
+#pragma omp declare target to(foo)
+#endif
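
For context, and not part of the patch itself: a minimal sketch of how the new portable interface is meant to be used from C compiled directly for a GPU target, in the spirit of the gpuintrin.c test above. Only the __gpu_* functions and the __gpu_kernel macro come from this patch; the kernel name, its arguments, and the atomic update are illustrative assumptions.

// Illustrative only: a grid-stride sum kernel written against the new
// portable wrappers. Everything except the __gpu_* calls and __gpu_kernel
// is made up for this example.
#include <gpuintrin.h>
#include <stdint.h>

__gpu_kernel void sum_kernel(const uint32_t *in, uint32_t *out, uint32_t n) {
  // Global thread id and total thread count, built from the block/thread
  // queries that resolve to the AMDGPU or NVPTX builtins underneath.
  uint32_t tid = __gpu_block_id(0) * __gpu_num_threads(0) + __gpu_thread_id(0);
  uint32_t stride = __gpu_num_blocks(0) * __gpu_num_threads(0);

  uint32_t local = 0;
  for (uint32_t i = tid; i < n; i += stride)
    local += in[i];

  // Reduce across the wavefront/warp, then let one lane publish the result.
  uint64_t mask = __gpu_lane_mask();
  uint32_t total = __gpu_lane_sum_u32(mask, local);
  if (__gpu_is_first_in_lane(mask))
    __atomic_fetch_add(out, total, __ATOMIC_RELAXED);
  __gpu_sync_lane(mask);
}

The same source should lower to __builtin_amdgcn_* calls on AMDGPU and __nvvm_* calls on NVPTX, which is exactly what the two target-specific headers encapsulate.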
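
Also for reference: the __DO_LANE_SCAN macro in gpuintrin.h is a Hillis-Steele style inclusive scan. On each step a lane adds the value held __step lanes below it, and the -__bitmask trick zeroes the contribution for lanes whose id is smaller than __step. A host-side sketch of the per-lane result it aims to produce, assuming all lanes are active (illustrative only, not part of the patch):

#include <stdint.h>

// Reference semantics for __gpu_lane_scan_u32 with all lanes active:
// lane i ends up holding in[0] + in[1] + ... + in[i], an inclusive prefix sum.
static void lane_scan_reference(const uint32_t *in, uint32_t *out,
                                uint32_t num_lanes) {
  uint32_t acc = 0;
  for (uint32_t lane = 0; lane < num_lanes; ++lane) {
    acc += in[lane];
    out[lane] = acc;
  }
}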