From 561c17b006e6546f659cd9f12dc4ebfb7ed4077c Mon Sep 17 00:00:00 2001 From: Wenju He Date: Mon, 31 Mar 2025 00:22:44 -0700 Subject: [PATCH 1/6] [SYCL] Move Subgroup*INTEL SPIR-V built-in declarations to clang SPIRVBuiltins.td Motivation is similar to c040f9a137f2: unify with SPV-IR mangling. SubgroupBlockReadINTEL is not handled in this PR: SPV-IR appends the return type to the mangled function name, so it might be simpler for the SYCL header to keep the current mangling. --- clang/lib/Sema/SPIRVBuiltins.td | 36 + .../spirv-builtin-lookup-intel-subgroup.cpp | 252 ++++ .../lib/amdgcn-amdhsa/group/collectives.cl | 264 ++++--- .../amdgcn-amdhsa/misc/sub_group_shuffle.cl | 727 ++++++++---------- libdevice/nativecpu_utils.cpp | 20 +- .../FixABIMuxBuiltinsSYCLNativeCPU.cpp | 12 +- sycl/include/sycl/__spirv/spirv_ops.hpp | 55 -- .../group_load_store_alignment.cpp | 4 +- .../group_load_store_native_key.cpp | 6 +- sycl/test/check_device_code/group_shuffle.cpp | 10 +- sycl/test/check_device_code/group_store.cpp | 32 +- .../native_cpu/shuffle_abi.cpp | 10 +- 12 files changed, 791 insertions(+), 637 deletions(-) create mode 100644 clang/test/CodeGenSPIRV/spirv-builtin-lookup-intel-subgroup.cpp diff --git a/clang/lib/Sema/SPIRVBuiltins.td b/clang/lib/Sema/SPIRVBuiltins.td index bba559f0f255a..4f7bd7d182d19 100644 --- a/clang/lib/Sema/SPIRVBuiltins.td +++ b/clang/lib/Sema/SPIRVBuiltins.td @@ -984,6 +984,42 @@ foreach name = ["GroupLogicalAndKHR", "GroupLogicalOrKHR"] in { def : SPVBuiltin; } +def SubgroupShuffleINTELVecType + : GenericType<"SubgroupShuffleINTELVecType", + TypeList<[Char, UChar, Short, UShort, Int, UInt, Float]>, + VecNoScalar>; + +foreach name = ["SubgroupShuffleINTEL", "SubgroupShuffleXorINTEL"] in { + def : SPVBuiltin<name, [AGenType1, AGenType1, UInt]>; + def : SPVBuiltin<name, [SubgroupShuffleINTELVecType, SubgroupShuffleINTELVecType, UInt]>; +} + +foreach name = ["SubgroupShuffleUpINTEL", "SubgroupShuffleDownINTEL"] in { + def : SPVBuiltin<name, [AGenType1, AGenType1, AGenType1, UInt]>; + def : SPVBuiltin<name, [SubgroupShuffleINTELVecType, SubgroupShuffleINTELVecType, SubgroupShuffleINTELVecType, UInt]>; +} + +foreach name = ["SubgroupBlockWriteINTEL"] in { + foreach AS = [GlobalAS, LocalAS] in { + foreach Ty = TLUnsignedInts.List in { + def : SPVBuiltin<name, [Void, PointerType<Ty, AS>, Ty]>; + } + foreach Ty = [UChar, UShort] in { + foreach v = [2, 4, 8, 16] in { + def : SPVBuiltin<name, [Void, PointerType<Ty, AS>, VectorType<Ty, v>]>; + } + } + foreach Ty = [UInt, ULong] in { + foreach v = [2, 4, 8] in { + def : SPVBuiltin<name, [Void, PointerType<Ty, AS>, VectorType<Ty, v>]>; + } + } + } +} +
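The mangling unification these declarations provide can be seen by contrasting the two styles, both of which appear in this patch (the removed libclc declarations use the old spelling, the new CHECK lines use the new one). A minimal sketch of the difference; the declarations below are illustrative, not part of the patch:

    // Old style: the SYCL headers declared the built-in as a C++ template,
    // so the name mangles with template syntax.
    template <typename T> T __spirv_SubgroupShuffleINTEL(T data, unsigned int id);
    // For T = int this mangles as _Z28__spirv_SubgroupShuffleINTELIiET_S0_j.

    // New style: -fdeclare-spirv-builtins declares plain overloads instead.
    int __spirv_SubgroupShuffleINTEL(int data, unsigned int id);
    // This mangles as _Z28__spirv_SubgroupShuffleINTELij, the SPV-IR form.

The libclc changes below follow the same pattern: hand-mangled `_Z28...IiET_S0_j` definitions are replaced by `_CLC_OVERLOAD` definitions with the plain names, and the compiler produces the overload mangling.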
 // 3.56.24. Non-Uniform Instructions foreach name = ["GroupNonUniformElect"] in { diff --git a/clang/test/CodeGenSPIRV/spirv-builtin-lookup-intel-subgroup.cpp b/clang/test/CodeGenSPIRV/spirv-builtin-lookup-intel-subgroup.cpp new file mode 100644 index 0000000000000..13c88e25bc988 --- /dev/null +++ b/clang/test/CodeGenSPIRV/spirv-builtin-lookup-intel-subgroup.cpp @@ -0,0 +1,252 @@ +// RUN: %clang_cc1 -triple=spir64 -fdeclare-spirv-builtins -emit-llvm %s -o - | FileCheck %s + +template <typename T> void test_shuffle() { + T v; + unsigned int id; + __spirv_SubgroupShuffleINTEL(v, id); + __spirv_SubgroupShuffleXorINTEL(v, id); + __spirv_SubgroupShuffleUpINTEL(v, v, id); + __spirv_SubgroupShuffleDownINTEL(v, v, id); +} + +template <typename T> void test_shuffle_scalar_and_vector() { + test_shuffle<T>(); + test_shuffle<T __attribute__((ext_vector_type(2)))>(); + test_shuffle<T __attribute__((ext_vector_type(3)))>(); + test_shuffle<T __attribute__((ext_vector_type(4)))>(); + test_shuffle<T __attribute__((ext_vector_type(8)))>(); + test_shuffle<T __attribute__((ext_vector_type(16)))>(); +} + +template <typename T, typename PtrTy> void test_block_write_addrspace(PtrTy ptr) { + T v; + __spirv_SubgroupBlockWriteINTEL(ptr, v); + using T2 = T __attribute__((ext_vector_type(2))); + T2 v2; + __spirv_SubgroupBlockWriteINTEL(ptr, v2); + using T4 = T __attribute__((ext_vector_type(4))); + T4 v4; + __spirv_SubgroupBlockWriteINTEL(ptr, v4); + using T8 = T __attribute__((ext_vector_type(8))); + T8 v8; + __spirv_SubgroupBlockWriteINTEL(ptr, v8); +} + +template <typename T, typename PtrTy> void test_block_write_addrspace_v16(PtrTy ptr) { + using T16 = T __attribute__((ext_vector_type(16))); + T16 v16; + __spirv_SubgroupBlockWriteINTEL(ptr, v16); +} + +template <typename T> void test_block_write() { + __attribute__((opencl_global)) T * gptr; + test_block_write_addrspace<T>(gptr); + __attribute__((opencl_local)) T * lptr; + test_block_write_addrspace<T>(lptr); +} + +template <typename T> void test_block_write_v16() { + __attribute__((opencl_global)) T * gptr; + test_block_write_addrspace_v16<T>(gptr); + __attribute__((opencl_local)) T * lptr; + test_block_write_addrspace_v16<T>(lptr); +} + +void test() { + test_shuffle<_Float16>(); + test_shuffle<double>(); + + test_shuffle_scalar_and_vector<unsigned char>(); + test_shuffle_scalar_and_vector<int>(); + test_shuffle_scalar_and_vector<short>(); + test_shuffle_scalar_and_vector<unsigned short>(); + test_shuffle_scalar_and_vector<unsigned int>(); + test_shuffle_scalar_and_vector<float>(); + + test_block_write<unsigned char>(); + test_block_write_v16<unsigned char>(); + test_block_write<unsigned short>(); + test_block_write_v16<unsigned short>(); + test_block_write<unsigned int>(); +} + +// CHECK: call spir_func noundef half @_Z28__spirv_SubgroupShuffleINTELDF16_j +// CHECK: call spir_func noundef half @_Z31__spirv_SubgroupShuffleXorINTELDF16_j +// CHECK: call spir_func noundef half @_Z30__spirv_SubgroupShuffleUpINTELDF16_DF16_j +// CHECK: call spir_func noundef half @_Z32__spirv_SubgroupShuffleDownINTELDF16_DF16_j +// CHECK: call spir_func noundef double @_Z28__spirv_SubgroupShuffleINTELdj +// CHECK: call spir_func noundef double @_Z31__spirv_SubgroupShuffleXorINTELdj +// CHECK: call spir_func noundef double @_Z30__spirv_SubgroupShuffleUpINTELddj +// CHECK: call spir_func noundef double @_Z32__spirv_SubgroupShuffleDownINTELddj +// CHECK: call spir_func noundef zeroext i8 @_Z28__spirv_SubgroupShuffleINTELhj +// CHECK: call spir_func noundef zeroext i8 @_Z31__spirv_SubgroupShuffleXorINTELhj +// CHECK: call spir_func noundef zeroext i8 @_Z30__spirv_SubgroupShuffleUpINTELhhj +// CHECK: call spir_func noundef zeroext i8 @_Z32__spirv_SubgroupShuffleDownINTELhhj +// CHECK: call spir_func noundef <2 x i8> @_Z28__spirv_SubgroupShuffleINTELDv2_hj +// CHECK: call spir_func noundef <2 x i8> @_Z31__spirv_SubgroupShuffleXorINTELDv2_hj +// CHECK: call spir_func noundef <2 x i8> @_Z30__spirv_SubgroupShuffleUpINTELDv2_hS_j +// CHECK: call spir_func noundef
<2 x i8> @_Z32__spirv_SubgroupShuffleDownINTELDv2_hS_j +// CHECK: call spir_func noundef <3 x i8> @_Z28__spirv_SubgroupShuffleINTELDv3_hj +// CHECK: call spir_func noundef <3 x i8> @_Z31__spirv_SubgroupShuffleXorINTELDv3_hj +// CHECK: call spir_func noundef <3 x i8> @_Z30__spirv_SubgroupShuffleUpINTELDv3_hS_j +// CHECK: call spir_func noundef <3 x i8> @_Z32__spirv_SubgroupShuffleDownINTELDv3_hS_j +// CHECK: call spir_func noundef <4 x i8> @_Z28__spirv_SubgroupShuffleINTELDv4_hj +// CHECK: call spir_func noundef <4 x i8> @_Z31__spirv_SubgroupShuffleXorINTELDv4_hj +// CHECK: call spir_func noundef <4 x i8> @_Z30__spirv_SubgroupShuffleUpINTELDv4_hS_j +// CHECK: call spir_func noundef <4 x i8> @_Z32__spirv_SubgroupShuffleDownINTELDv4_hS_j +// CHECK: call spir_func noundef <8 x i8> @_Z28__spirv_SubgroupShuffleINTELDv8_hj +// CHECK: call spir_func noundef <8 x i8> @_Z31__spirv_SubgroupShuffleXorINTELDv8_hj +// CHECK: call spir_func noundef <8 x i8> @_Z30__spirv_SubgroupShuffleUpINTELDv8_hS_j +// CHECK: call spir_func noundef <8 x i8> @_Z32__spirv_SubgroupShuffleDownINTELDv8_hS_j +// CHECK: call spir_func noundef <16 x i8> @_Z28__spirv_SubgroupShuffleINTELDv16_hj +// CHECK: call spir_func noundef <16 x i8> @_Z31__spirv_SubgroupShuffleXorINTELDv16_hj +// CHECK: call spir_func noundef <16 x i8> @_Z30__spirv_SubgroupShuffleUpINTELDv16_hS_j +// CHECK: call spir_func noundef <16 x i8> @_Z32__spirv_SubgroupShuffleDownINTELDv16_hS_j +// CHECK: call spir_func noundef i32 @_Z28__spirv_SubgroupShuffleINTELij +// CHECK: call spir_func noundef i32 @_Z31__spirv_SubgroupShuffleXorINTELij +// CHECK: call spir_func noundef i32 @_Z30__spirv_SubgroupShuffleUpINTELiij +// CHECK: call spir_func noundef i32 @_Z32__spirv_SubgroupShuffleDownINTELiij +// CHECK: call spir_func noundef <2 x i32> @_Z28__spirv_SubgroupShuffleINTELDv2_ij +// CHECK: call spir_func noundef <2 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv2_ij +// CHECK: call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv2_iS_j +// CHECK: call spir_func noundef <2 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv2_iS_j +// CHECK: call spir_func noundef <3 x i32> @_Z28__spirv_SubgroupShuffleINTELDv3_ij +// CHECK: call spir_func noundef <3 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv3_ij +// CHECK: call spir_func noundef <3 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv3_iS_j +// CHECK: call spir_func noundef <3 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv3_iS_j +// CHECK: call spir_func noundef <4 x i32> @_Z28__spirv_SubgroupShuffleINTELDv4_ij +// CHECK: call spir_func noundef <4 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv4_ij +// CHECK: call spir_func noundef <4 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv4_iS_j +// CHECK: call spir_func noundef <4 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv4_iS_j +// CHECK: call spir_func noundef <8 x i32> @_Z28__spirv_SubgroupShuffleINTELDv8_ij +// CHECK: call spir_func noundef <8 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv8_ij +// CHECK: call spir_func noundef <8 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv8_iS_j +// CHECK: call spir_func noundef <8 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv8_iS_j +// CHECK: call spir_func noundef <16 x i32> @_Z28__spirv_SubgroupShuffleINTELDv16_ij +// CHECK: call spir_func noundef <16 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv16_ij +// CHECK: call spir_func noundef <16 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv16_iS_j +// CHECK: call spir_func noundef <16 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv16_iS_j +// CHECK: call spir_func noundef signext i16 
@_Z28__spirv_SubgroupShuffleINTELsj +// CHECK: call spir_func noundef signext i16 @_Z31__spirv_SubgroupShuffleXorINTELsj +// CHECK: call spir_func noundef signext i16 @_Z30__spirv_SubgroupShuffleUpINTELssj +// CHECK: call spir_func noundef signext i16 @_Z32__spirv_SubgroupShuffleDownINTELssj +// CHECK: call spir_func noundef <2 x i16> @_Z28__spirv_SubgroupShuffleINTELDv2_sj +// CHECK: call spir_func noundef <2 x i16> @_Z31__spirv_SubgroupShuffleXorINTELDv2_sj +// CHECK: call spir_func noundef <2 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv2_sS_j +// CHECK: call spir_func noundef <2 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv2_sS_j +// CHECK: call spir_func noundef <3 x i16> @_Z28__spirv_SubgroupShuffleINTELDv3_sj +// CHECK: call spir_func noundef <3 x i16> @_Z31__spirv_SubgroupShuffleXorINTELDv3_sj +// CHECK: call spir_func noundef <3 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv3_sS_j +// CHECK: call spir_func noundef <3 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv3_sS_j +// CHECK: call spir_func noundef <4 x i16> @_Z28__spirv_SubgroupShuffleINTELDv4_sj +// CHECK: call spir_func noundef <4 x i16> @_Z31__spirv_SubgroupShuffleXorINTELDv4_sj +// CHECK: call spir_func noundef <4 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv4_sS_j +// CHECK: call spir_func noundef <4 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv4_sS_j +// CHECK: call spir_func noundef <8 x i16> @_Z28__spirv_SubgroupShuffleINTELDv8_sj +// CHECK: call spir_func noundef <8 x i16> @_Z31__spirv_SubgroupShuffleXorINTELDv8_sj +// CHECK: call spir_func noundef <8 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv8_sS_j +// CHECK: call spir_func noundef <8 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv8_sS_j +// CHECK: call spir_func noundef <16 x i16> @_Z28__spirv_SubgroupShuffleINTELDv16_sj +// CHECK: call spir_func noundef <16 x i16> @_Z31__spirv_SubgroupShuffleXorINTELDv16_sj +// CHECK: call spir_func noundef <16 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv16_sS_j +// CHECK: call spir_func noundef <16 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv16_sS_j +// CHECK: call spir_func noundef zeroext i16 @_Z28__spirv_SubgroupShuffleINTELtj +// CHECK: call spir_func noundef zeroext i16 @_Z31__spirv_SubgroupShuffleXorINTELtj +// CHECK: call spir_func noundef zeroext i16 @_Z30__spirv_SubgroupShuffleUpINTELttj +// CHECK: call spir_func noundef zeroext i16 @_Z32__spirv_SubgroupShuffleDownINTELttj +// CHECK: call spir_func noundef <2 x i16> @_Z28__spirv_SubgroupShuffleINTELDv2_tj +// CHECK: call spir_func noundef <2 x i16> @_Z31__spirv_SubgroupShuffleXorINTELDv2_tj +// CHECK: call spir_func noundef <2 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv2_tS_j +// CHECK: call spir_func noundef <2 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv2_tS_j +// CHECK: call spir_func noundef <3 x i16> @_Z28__spirv_SubgroupShuffleINTELDv3_tj +// CHECK: call spir_func noundef <3 x i16> @_Z31__spirv_SubgroupShuffleXorINTELDv3_tj +// CHECK: call spir_func noundef <3 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv3_tS_j +// CHECK: call spir_func noundef <3 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv3_tS_j +// CHECK: call spir_func noundef <4 x i16> @_Z28__spirv_SubgroupShuffleINTELDv4_tj +// CHECK: call spir_func noundef <4 x i16> @_Z31__spirv_SubgroupShuffleXorINTELDv4_tj +// CHECK: call spir_func noundef <4 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv4_tS_j +// CHECK: call spir_func noundef <4 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv4_tS_j +// CHECK: call spir_func noundef <8 x i16> @_Z28__spirv_SubgroupShuffleINTELDv8_tj +// CHECK: call spir_func noundef <8 x 
i16> @_Z31__spirv_SubgroupShuffleXorINTELDv8_tj +// CHECK: call spir_func noundef <8 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv8_tS_j +// CHECK: call spir_func noundef <8 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv8_tS_j +// CHECK: call spir_func noundef <16 x i16> @_Z28__spirv_SubgroupShuffleINTELDv16_tj +// CHECK: call spir_func noundef <16 x i16> @_Z31__spirv_SubgroupShuffleXorINTELDv16_tj +// CHECK: call spir_func noundef <16 x i16> @_Z30__spirv_SubgroupShuffleUpINTELDv16_tS_j +// CHECK: call spir_func noundef <16 x i16> @_Z32__spirv_SubgroupShuffleDownINTELDv16_tS_j +// CHECK: call spir_func noundef i32 @_Z28__spirv_SubgroupShuffleINTELjj +// CHECK: call spir_func noundef i32 @_Z31__spirv_SubgroupShuffleXorINTELjj +// CHECK: call spir_func noundef i32 @_Z30__spirv_SubgroupShuffleUpINTELjjj +// CHECK: call spir_func noundef i32 @_Z32__spirv_SubgroupShuffleDownINTELjjj +// CHECK: call spir_func noundef <2 x i32> @_Z28__spirv_SubgroupShuffleINTELDv2_jj +// CHECK: call spir_func noundef <2 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv2_jj +// CHECK: call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv2_jS_j +// CHECK: call spir_func noundef <2 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv2_jS_j +// CHECK: call spir_func noundef <3 x i32> @_Z28__spirv_SubgroupShuffleINTELDv3_jj +// CHECK: call spir_func noundef <3 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv3_jj +// CHECK: call spir_func noundef <3 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv3_jS_j +// CHECK: call spir_func noundef <3 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv3_jS_j +// CHECK: call spir_func noundef <4 x i32> @_Z28__spirv_SubgroupShuffleINTELDv4_jj +// CHECK: call spir_func noundef <4 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv4_jj +// CHECK: call spir_func noundef <4 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv4_jS_j +// CHECK: call spir_func noundef <4 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv4_jS_j +// CHECK: call spir_func noundef <8 x i32> @_Z28__spirv_SubgroupShuffleINTELDv8_jj +// CHECK: call spir_func noundef <8 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv8_jj +// CHECK: call spir_func noundef <8 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv8_jS_j +// CHECK: call spir_func noundef <8 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv8_jS_j +// CHECK: call spir_func noundef <16 x i32> @_Z28__spirv_SubgroupShuffleINTELDv16_jj +// CHECK: call spir_func noundef <16 x i32> @_Z31__spirv_SubgroupShuffleXorINTELDv16_jj +// CHECK: call spir_func noundef <16 x i32> @_Z30__spirv_SubgroupShuffleUpINTELDv16_jS_j +// CHECK: call spir_func noundef <16 x i32> @_Z32__spirv_SubgroupShuffleDownINTELDv16_jS_j +// CHECK: call spir_func noundef float @_Z28__spirv_SubgroupShuffleINTELfj +// CHECK: call spir_func noundef float @_Z31__spirv_SubgroupShuffleXorINTELfj +// CHECK: call spir_func noundef float @_Z30__spirv_SubgroupShuffleUpINTELffj +// CHECK: call spir_func noundef float @_Z32__spirv_SubgroupShuffleDownINTELffj +// CHECK: call spir_func noundef <2 x float> @_Z28__spirv_SubgroupShuffleINTELDv2_fj +// CHECK: call spir_func noundef <2 x float> @_Z31__spirv_SubgroupShuffleXorINTELDv2_fj +// CHECK: call spir_func noundef <2 x float> @_Z30__spirv_SubgroupShuffleUpINTELDv2_fS_j +// CHECK: call spir_func noundef <2 x float> @_Z32__spirv_SubgroupShuffleDownINTELDv2_fS_j +// CHECK: call spir_func noundef <3 x float> @_Z28__spirv_SubgroupShuffleINTELDv3_fj +// CHECK: call spir_func noundef <3 x float> @_Z31__spirv_SubgroupShuffleXorINTELDv3_fj +// CHECK: call spir_func noundef <3 x float> 
@_Z30__spirv_SubgroupShuffleUpINTELDv3_fS_j +// CHECK: call spir_func noundef <3 x float> @_Z32__spirv_SubgroupShuffleDownINTELDv3_fS_j +// CHECK: call spir_func noundef <4 x float> @_Z28__spirv_SubgroupShuffleINTELDv4_fj +// CHECK: call spir_func noundef <4 x float> @_Z31__spirv_SubgroupShuffleXorINTELDv4_fj +// CHECK: call spir_func noundef <4 x float> @_Z30__spirv_SubgroupShuffleUpINTELDv4_fS_j +// CHECK: call spir_func noundef <4 x float> @_Z32__spirv_SubgroupShuffleDownINTELDv4_fS_j +// CHECK: call spir_func noundef <8 x float> @_Z28__spirv_SubgroupShuffleINTELDv8_fj +// CHECK: call spir_func noundef <8 x float> @_Z31__spirv_SubgroupShuffleXorINTELDv8_fj +// CHECK: call spir_func noundef <8 x float> @_Z30__spirv_SubgroupShuffleUpINTELDv8_fS_j +// CHECK: call spir_func noundef <8 x float> @_Z32__spirv_SubgroupShuffleDownINTELDv8_fS_j +// CHECK: call spir_func noundef <16 x float> @_Z28__spirv_SubgroupShuffleINTELDv16_fj +// CHECK: call spir_func noundef <16 x float> @_Z31__spirv_SubgroupShuffleXorINTELDv16_fj +// CHECK: call spir_func noundef <16 x float> @_Z30__spirv_SubgroupShuffleUpINTELDv16_fS_j +// CHECK: call spir_func noundef <16 x float> @_Z32__spirv_SubgroupShuffleDownINTELDv16_fS_j +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1hh +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1hDv2_h +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1hDv4_h +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1hDv8_h +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3hh +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3hDv2_h +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3hDv4_h +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3hDv8_h +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1hDv16_h +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3hDv16_h +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1tt +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1tDv2_t +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1tDv4_t +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1tDv8_t +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3tt +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3tDv2_t +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3tDv4_t +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3tDv8_t +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1tDv16_t +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3tDv16_t +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jj +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jDv2_j +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jDv4_j +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jDv8_j +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3jj +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3jDv2_j +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3jDv4_j +// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3jDv8_j diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl 
b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl index 27218592b2487..9862019a41b72 100644 --- a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl +++ b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl @@ -38,24 +38,23 @@ __clc__get_group_scratch_float() __asm("__clc__get_group_scratch_float"); __local double * __clc__get_group_scratch_double() __asm("__clc__get_group_scratch_double"); -#define __CLC_DECLARE_SHUFFLES(TYPE, TYPE_MANGLED) \ - _CLC_DECL TYPE _Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j( \ - TYPE, int); \ - _CLC_DECL TYPE \ - _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j( \ - TYPE, TYPE, unsigned int); - -__CLC_DECLARE_SHUFFLES(char, a); -__CLC_DECLARE_SHUFFLES(unsigned char, h); -__CLC_DECLARE_SHUFFLES(short, s); -__CLC_DECLARE_SHUFFLES(unsigned short, t); -__CLC_DECLARE_SHUFFLES(int, i); -__CLC_DECLARE_SHUFFLES(unsigned int, j); -__CLC_DECLARE_SHUFFLES(half, DF16_); -__CLC_DECLARE_SHUFFLES(float, f); -__CLC_DECLARE_SHUFFLES(long, l); -__CLC_DECLARE_SHUFFLES(unsigned long, m); -__CLC_DECLARE_SHUFFLES(double, d); +#define __CLC_DECLARE_SHUFFLES(TYPE) \ + _CLC_OVERLOAD _CLC_DECL TYPE __spirv_SubgroupShuffleINTEL( \ + TYPE, unsigned int); \ + _CLC_OVERLOAD _CLC_DECL TYPE __spirv_SubgroupShuffleUpINTEL( \ + TYPE, TYPE, unsigned int); + +__CLC_DECLARE_SHUFFLES(char); +__CLC_DECLARE_SHUFFLES(unsigned char); +__CLC_DECLARE_SHUFFLES(short); +__CLC_DECLARE_SHUFFLES(unsigned short); +__CLC_DECLARE_SHUFFLES(int); +__CLC_DECLARE_SHUFFLES(unsigned int); +__CLC_DECLARE_SHUFFLES(half); +__CLC_DECLARE_SHUFFLES(float); +__CLC_DECLARE_SHUFFLES(long); +__CLC_DECLARE_SHUFFLES(unsigned long); +__CLC_DECLARE_SHUFFLES(double); #undef __CLC_DECLARE_SHUFFLES @@ -71,125 +70,122 @@ __CLC_DECLARE_SHUFFLES(double, d); #define __CLC_LOGICAL_OR(x, y) (x || y) #define __CLC_LOGICAL_AND(x, y) (x && y) -#define __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, TYPE_MANGLED, IDENTITY) \ - uint sg_lid = __spirv_SubgroupLocalInvocationId(); \ - /* Can't use XOR/butterfly shuffles; some lanes may be inactive */ \ - for (int o = 1; o < __spirv_SubgroupMaxSize(); o *= 2) { \ - TYPE contribution = \ - _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j(x, x, \ - o); \ - bool inactive = (sg_lid < o); \ - contribution = (inactive) ? IDENTITY : contribution; \ - x = OP(x, contribution); \ - } \ - /* For Reduce, broadcast result from highest active lane */ \ - TYPE result; \ - if (op == Reduce) { \ - result = _Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j( \ - x, __spirv_SubgroupSize() - 1); \ - *carry = result; \ - } /* For InclusiveScan, use results as computed */ \ - else if (op == InclusiveScan) { \ - result = x; \ - *carry = result; \ - } /* For ExclusiveScan, shift and prepend identity */ \ - else if (op == ExclusiveScan) { \ - *carry = x; \ - result = _Z30__spirv_SubgroupShuffleUpINTELI##TYPE_MANGLED##ET_S0_S0_j( \ - x, x, 1); \ - if (sg_lid == 0) { \ - result = IDENTITY; \ - } \ - } \ +#define __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, IDENTITY) \ + uint sg_lid = __spirv_SubgroupLocalInvocationId(); \ + /* Can't use XOR/butterfly shuffles; some lanes may be inactive */ \ + for (int o = 1; o < __spirv_SubgroupMaxSize(); o *= 2) { \ + TYPE contribution = __spirv_SubgroupShuffleUpINTEL(x, x, o); \ + bool inactive = (sg_lid < o); \ + contribution = (inactive) ? 
IDENTITY : contribution; \ + x = OP(x, contribution); \ + } \ + /* For Reduce, broadcast result from highest active lane */ \ + TYPE result; \ + if (op == Reduce) { \ + result = __spirv_SubgroupShuffleINTEL( \ + x, __spirv_SubgroupSize() - 1); \ + *carry = result; \ + } /* For InclusiveScan, use results as computed */ \ + else if (op == InclusiveScan) { \ + result = x; \ + *carry = result; \ + } /* For ExclusiveScan, shift and prepend identity */ \ + else if (op == ExclusiveScan) { \ + *carry = x; \ + result = __spirv_SubgroupShuffleUpINTEL(x, x, 1); \ + if (sg_lid == 0) { \ + result = IDENTITY; \ + } \ + } \ return result; -#define __CLC_SUBGROUP_COLLECTIVE(NAME, OP, TYPE, TYPE_MANGLED, IDENTITY) \ - _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __CLC_APPEND( \ - __clc__Subgroup, NAME)(uint op, TYPE x, TYPE * carry) { \ - __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, TYPE_MANGLED, IDENTITY) \ +#define __CLC_SUBGROUP_COLLECTIVE(NAME, OP, TYPE, IDENTITY) \ + _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __CLC_APPEND( \ + __clc__Subgroup, NAME)(uint op, TYPE x, TYPE * carry) { \ + __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, IDENTITY) \ } -__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, char, a, 0) -__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, uchar, h, 0) -__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, short, s, 0) -__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, ushort, t, 0) -__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, int, i, 0) -__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, uint, j, 0) -__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, long, l, 0) -__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, ulong, m, 0) -__CLC_SUBGROUP_COLLECTIVE(FAdd, __CLC_ADD, half, DF16_, 0) -__CLC_SUBGROUP_COLLECTIVE(FAdd, __CLC_ADD, float, f, 0) -__CLC_SUBGROUP_COLLECTIVE(FAdd, __CLC_ADD, double, d, 0) - -__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, char, a, 1) -__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, uchar, h, 1) -__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, short, s, 1) -__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, ushort, t, 1) -__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, int, i, 1) -__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, uint, j, 1) -__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, long, l, 1) -__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, ulong, m, 1) -__CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, half, DF16_, 1) -__CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, float, f, 1) -__CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, double, d, 1) - -__CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, char, a, CHAR_MAX) -__CLC_SUBGROUP_COLLECTIVE(UMin, __CLC_MIN, uchar, h, UCHAR_MAX) -__CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, short, s, SHRT_MAX) -__CLC_SUBGROUP_COLLECTIVE(UMin, __CLC_MIN, ushort, t, USHRT_MAX) -__CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, int, i, INT_MAX) -__CLC_SUBGROUP_COLLECTIVE(UMin, __CLC_MIN, uint, j, UINT_MAX) -__CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, long, l, LONG_MAX) -__CLC_SUBGROUP_COLLECTIVE(UMin, __CLC_MIN, ulong, m, ULONG_MAX) -__CLC_SUBGROUP_COLLECTIVE(FMin, __CLC_MIN, half, DF16_, INFINITY) -__CLC_SUBGROUP_COLLECTIVE(FMin, __CLC_MIN, float, f, INFINITY) -__CLC_SUBGROUP_COLLECTIVE(FMin, __CLC_MIN, double, d, INFINITY) - -__CLC_SUBGROUP_COLLECTIVE(SMax, __CLC_MAX, char, a, CHAR_MIN) -__CLC_SUBGROUP_COLLECTIVE(UMax, __CLC_MAX, uchar, h, 0) -__CLC_SUBGROUP_COLLECTIVE(SMax, __CLC_MAX, short, s, SHRT_MIN) -__CLC_SUBGROUP_COLLECTIVE(UMax, __CLC_MAX, ushort, t, 0) -__CLC_SUBGROUP_COLLECTIVE(SMax, __CLC_MAX, int, i, INT_MIN) -__CLC_SUBGROUP_COLLECTIVE(UMax, __CLC_MAX, uint, j, 0) -__CLC_SUBGROUP_COLLECTIVE(SMax, __CLC_MAX, long, l, 
LONG_MIN) -__CLC_SUBGROUP_COLLECTIVE(UMax, __CLC_MAX, ulong, m, 0) -__CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, half, DF16_, -INFINITY) -__CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, float, f, -INFINITY) -__CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, d, -INFINITY) - -__CLC_SUBGROUP_COLLECTIVE(All, __CLC_AND, bool, a, true) -__CLC_SUBGROUP_COLLECTIVE(Any, __CLC_OR, bool, a, false) - -__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uchar, h, ~0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uchar, h, 0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uchar, h, 0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, char, a, ~0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, char, a, 0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, char, a, 0) - -__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ushort, t, ~0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ushort, t, 0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ushort, t, 0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, short, s, ~0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, short, s, 0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, short, s, 0) - -__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uint, j, ~0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uint, j, 0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uint, j, 0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, int, i, ~0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, int, i, 0) -__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, int, i, 0) - -__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ulong, m, ~0l) -__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ulong, m, 0l) -__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ulong, m, 0l) -__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, long, l, ~0l) -__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, long, l, 0l) -__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, l, 0l) - -__CLC_SUBGROUP_COLLECTIVE(LogicalOrKHR, __CLC_LOGICAL_OR, bool, a, false) -__CLC_SUBGROUP_COLLECTIVE(LogicalAndKHR, __CLC_LOGICAL_AND, bool, a, true) +__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, char, 0) +__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, uchar, 0) +__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, short, 0) +__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, ushort, 0) +__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, int, 0) +__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, uint, 0) +__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, long, 0) +__CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, ulong, 0) +__CLC_SUBGROUP_COLLECTIVE(FAdd, __CLC_ADD, half, 0) +__CLC_SUBGROUP_COLLECTIVE(FAdd, __CLC_ADD, float, 0) +__CLC_SUBGROUP_COLLECTIVE(FAdd, __CLC_ADD, double, 0) + +__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, char, 1) +__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, uchar, 1) +__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, short, 1) +__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, ushort, 1) +__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, int, 1) +__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, uint, 1) +__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, long, 1) +__CLC_SUBGROUP_COLLECTIVE(IMulKHR, __CLC_MUL, ulong, 1) +__CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, half, 1) +__CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, float, 1) +__CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, double, 1) + +__CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, char, CHAR_MAX) +__CLC_SUBGROUP_COLLECTIVE(UMin, __CLC_MIN, uchar, UCHAR_MAX) +__CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, short, SHRT_MAX) +__CLC_SUBGROUP_COLLECTIVE(UMin, 
__CLC_MIN, ushort, USHRT_MAX) +__CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, int, INT_MAX) +__CLC_SUBGROUP_COLLECTIVE(UMin, __CLC_MIN, uint, UINT_MAX) +__CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, long, LONG_MAX) +__CLC_SUBGROUP_COLLECTIVE(UMin, __CLC_MIN, ulong, ULONG_MAX) +__CLC_SUBGROUP_COLLECTIVE(FMin, __CLC_MIN, half, INFINITY) +__CLC_SUBGROUP_COLLECTIVE(FMin, __CLC_MIN, float, INFINITY) +__CLC_SUBGROUP_COLLECTIVE(FMin, __CLC_MIN, double, INFINITY) + +__CLC_SUBGROUP_COLLECTIVE(SMax, __CLC_MAX, char, CHAR_MIN) +__CLC_SUBGROUP_COLLECTIVE(UMax, __CLC_MAX, uchar, 0) +__CLC_SUBGROUP_COLLECTIVE(SMax, __CLC_MAX, short, SHRT_MIN) +__CLC_SUBGROUP_COLLECTIVE(UMax, __CLC_MAX, ushort, 0) +__CLC_SUBGROUP_COLLECTIVE(SMax, __CLC_MAX, int, INT_MIN) +__CLC_SUBGROUP_COLLECTIVE(UMax, __CLC_MAX, uint, 0) +__CLC_SUBGROUP_COLLECTIVE(SMax, __CLC_MAX, long, LONG_MIN) +__CLC_SUBGROUP_COLLECTIVE(UMax, __CLC_MAX, ulong, 0) +__CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, half, -INFINITY) +__CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, float, -INFINITY) +__CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, -INFINITY) + +__CLC_SUBGROUP_COLLECTIVE(All, __CLC_AND, bool, true) +__CLC_SUBGROUP_COLLECTIVE(Any, __CLC_OR, bool, false) + +__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uchar, ~0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uchar, 0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uchar, 0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, char, ~0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, char, 0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, char, 0) + +__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ushort, ~0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ushort, 0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ushort, 0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, short, ~0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, short, 0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, short, 0) + +__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uint, ~0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uint, 0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uint, 0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, int, ~0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, int, 0) +__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, int, 0) + +__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ulong, ~0l) +__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ulong, 0l) +__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ulong, 0l) +__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, long, ~0l) +__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, long, 0l) +__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, 0l) + +__CLC_SUBGROUP_COLLECTIVE(LogicalOrKHR, __CLC_LOGICAL_OR, bool, false) +__CLC_SUBGROUP_COLLECTIVE(LogicalAndKHR, __CLC_LOGICAL_AND, bool, true) #undef __CLC_SUBGROUP_COLLECTIVE_BODY #undef __CLC_SUBGROUP_COLLECTIVE @@ -382,7 +378,7 @@ long __clc__3d_to_linear_local_id(ulong3 id) { _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __spirv_GroupBroadcast( \ int scope, TYPE x, ulong local_id) { \ if (scope == Subgroup) { \ - return _Z28__spirv_SubgroupShuffleINTELI##TYPE_MANGLED##ET_S0_j( \ + return __spirv_SubgroupShuffleINTEL( \ x, local_id); \ } \ bool source = (__spirv_LocalInvocationIndex() == local_id); \ diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl b/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl index 4a69ffdd00709..3b4be56efd2f2 100644 --- 
a/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl +++ b/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl @@ -12,36 +12,29 @@ #define SUBGROUP_SIZE __spirv_SubgroupMaxSize() // Shuffle -// int __spirv_SubgroupShuffleINTEL(int, unsigned int) -_CLC_DEF int -_Z28__spirv_SubgroupShuffleINTELIiET_S0_j(int Data, unsigned int InvocationId) { +_CLC_OVERLOAD _CLC_DEF int +__spirv_SubgroupShuffleINTEL(int Data, unsigned int InvocationId) { int Index = InvocationId; return __builtin_amdgcn_ds_bpermute(Index << 2, Data); } // Sub 32-bit types. -// _Z28__spirv_SubgroupShuffleINTELIaET_S0_j - char -// _Z28__spirv_SubgroupShuffleINTELIhET_S0_j - unsigned char -// _Z28__spirv_SubgroupShuffleINTELIsET_S0_j - long -// _Z28__spirv_SubgroupShuffleINTELItET_S0_j - unsigned long -// _Z28__spirv_SubgroupShuffleINTELIDF16_ET_S0_j - half -#define __AMDGCN_CLC_SUBGROUP_SUB_I32(TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE _Z28__spirv_SubgroupShuffleINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ - TYPE Data, unsigned int InvocationId) { \ - return _Z28__spirv_SubgroupShuffleINTELIiET_S0_j(Data, InvocationId); \ +#define __AMDGCN_CLC_SUBGROUP_SUB_I32(TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleINTEL((int)Data, InvocationId); \ } -__AMDGCN_CLC_SUBGROUP_SUB_I32(char, a); -__AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned char, h); -__AMDGCN_CLC_SUBGROUP_SUB_I32(short, s); -__AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned short, t); +__AMDGCN_CLC_SUBGROUP_SUB_I32(char); +__AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned char); +__AMDGCN_CLC_SUBGROUP_SUB_I32(short); +__AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned short); #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_DEF half _Z28__spirv_SubgroupShuffleINTELIDF16_ET_S0_j( +_CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleINTEL( half Data, unsigned int InvocationId) { unsigned short tmp = as_ushort(Data); - tmp = (unsigned short)_Z28__spirv_SubgroupShuffleINTELIiET_S0_j((int)tmp, - InvocationId); + tmp = __spirv_SubgroupShuffleINTEL(tmp, InvocationId); return as_half(tmp); } #endif // cl_khr_fp16 @@ -49,109 +42,100 @@ _CLC_DEF half _Z28__spirv_SubgroupShuffleINTELIDF16_ET_S0_j( #undef __AMDGCN_CLC_SUBGROUP_SUB_I32 // 32-bit types. -// __spirv_SubgroupShuffleINTEL - unsigned int -// __spirv_SubgroupShuffleINTEL- float -#define __AMDGCN_CLC_SUBGROUP_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE _Z28__spirv_SubgroupShuffleINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ - TYPE Data, unsigned int InvocationId) { \ - return __builtin_astype( \ - _Z28__spirv_SubgroupShuffleINTELIiET_S0_j(as_int(Data), InvocationId), \ - CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_I32(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + return __builtin_astype( \ + __spirv_SubgroupShuffleINTEL(as_int(Data), InvocationId), \ + CAST_TYPE); \ } -__AMDGCN_CLC_SUBGROUP_I32(unsigned int, uint, j); -__AMDGCN_CLC_SUBGROUP_I32(float, float, f); +__AMDGCN_CLC_SUBGROUP_I32(unsigned int, uint); +__AMDGCN_CLC_SUBGROUP_I32(float, float); #undef __AMDGCN_CLC_SUBGROUP_I32 // 64-bit types. 
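`__builtin_amdgcn_ds_bpermute` moves a single 32-bit value per lane, so 64-bit types cannot be shuffled in one step; the `__AMDGCN_CLC_SUBGROUP_I64` macro below instead bitcasts the value to an `int2` and shuffles the two halves independently. Roughly, its expansion for `double` looks like this (a sketch of the macro's output, not an additional definition):

    _CLC_OVERLOAD _CLC_DEF double
    __spirv_SubgroupShuffleINTEL(double Data, unsigned int InvocationId) {
      int2 tmp = as_int2(Data); // reinterpret the 64 bits as two 32-bit words
      tmp.lo = __spirv_SubgroupShuffleINTEL(tmp.lo, InvocationId); // low word
      tmp.hi = __spirv_SubgroupShuffleINTEL(tmp.hi, InvocationId); // high word
      return __builtin_astype(tmp, double); // reassemble the shuffled halves
    }

Both halves use the same `InvocationId`, so each lane still receives a coherent 64-bit value from a single source lane.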
-// __spirv_SubgroupShuffleINTEL - long -// __spirv_SubgroupShuffleINTEL - unsigned long -// __spirv_SubgroupShuffleINTEL - double -#define __AMDGCN_CLC_SUBGROUP_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE _Z28__spirv_SubgroupShuffleINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ - TYPE Data, unsigned int InvocationId) { \ - int2 tmp = as_int2(Data); \ - tmp.lo = _Z28__spirv_SubgroupShuffleINTELIiET_S0_j(tmp.lo, InvocationId); \ - tmp.hi = _Z28__spirv_SubgroupShuffleINTELIiET_S0_j(tmp.hi, InvocationId); \ - return __builtin_astype(tmp, CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_I64(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + int2 tmp = as_int2(Data); \ + tmp.lo = __spirv_SubgroupShuffleINTEL(tmp.lo, InvocationId); \ + tmp.hi = __spirv_SubgroupShuffleINTEL(tmp.hi, InvocationId); \ + return __builtin_astype(tmp, CAST_TYPE); \ } -__AMDGCN_CLC_SUBGROUP_I64(long, long, l); -__AMDGCN_CLC_SUBGROUP_I64(unsigned long, ulong, m); -__AMDGCN_CLC_SUBGROUP_I64(double, double, d); +__AMDGCN_CLC_SUBGROUP_I64(long, long); +__AMDGCN_CLC_SUBGROUP_I64(unsigned long, ulong); +__AMDGCN_CLC_SUBGROUP_I64(double, double); #undef __AMDGCN_CLC_SUBGROUP_I64 // Vector types. -#define __AMDGCN_CLC_SUBGROUP_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ - _CLC_DEF TYPE \ - _Z28__spirv_SubgroupShuffleINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_j( \ - TYPE Data, unsigned int InvocationId) { \ - TYPE res; \ - for (int i = 0; i < NUM_ELEMS; ++i) { \ - res[i] = _Z28__spirv_SubgroupShuffleINTELI##MANGLED_SCALAR_TY##ET_S0_j( \ - Data[i], InvocationId); \ - } \ - return res; \ +#define __AMDGCN_CLC_SUBGROUP_TO_VEC(TYPE, NUM_ELEMS) \ + _CLC_OVERLOAD _CLC_DEF TYPE \ + __spirv_SubgroupShuffleINTEL(TYPE Data, unsigned int InvocationId) { \ + TYPE res; \ + for (int i = 0; i < NUM_ELEMS; ++i) { \ + res[i] = __spirv_SubgroupShuffleINTEL(Data[i], InvocationId); \ + } \ + return res; \ } // [u]char -__AMDGCN_CLC_SUBGROUP_TO_VEC(char2, a, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(char4, a, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(char8, a, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(char16, a, 16) -__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar2, h, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar4, h, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar8, h, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar16, h, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(char2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(char4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(char8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(char16, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar16, 16) // [u]short -__AMDGCN_CLC_SUBGROUP_TO_VEC(short2, s, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(short4, s, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(short8, s, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(short16, s, 16) -__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort2, t, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort4, t, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort8, t, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort16, t, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(short2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(short4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(short8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(short16, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort16, 16) // [u]int -__AMDGCN_CLC_SUBGROUP_TO_VEC(int2, i, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(int4, i, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(int8, i, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(int16, i, 16) 
-__AMDGCN_CLC_SUBGROUP_TO_VEC(uint2, j, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(uint4, j, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(uint8, j, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(uint16, j, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(int2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(int4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(int8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(int16, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uint2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uint4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uint8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uint16, 16) // [u]long -__AMDGCN_CLC_SUBGROUP_TO_VEC(long2, l, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(long4, l, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(long8, l, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(long16, l, 16) -__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong2, m, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong4, m, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong8, m, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong16, m, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(long2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(long4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(long8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(long16, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong16, 16) // half #ifdef cl_khr_fp16 -__AMDGCN_CLC_SUBGROUP_TO_VEC(half2, DF16_, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(half4, DF16_, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(half8, DF16_, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(half16, DF16_, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(half2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(half4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(half8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(half16, 16) #endif // cl_khr_fp16 // float -__AMDGCN_CLC_SUBGROUP_TO_VEC(float2, f, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(float4, f, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(float8, f, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(float16, f, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(float2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(float4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(float8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(float16, 16) // double -__AMDGCN_CLC_SUBGROUP_TO_VEC(double2, d, 2) -__AMDGCN_CLC_SUBGROUP_TO_VEC(double4, d, 4) -__AMDGCN_CLC_SUBGROUP_TO_VEC(double8, d, 8) -__AMDGCN_CLC_SUBGROUP_TO_VEC(double16, d, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(double2, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(double4, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(double8, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(double16, 16) #undef __AMDGCN_CLC_SUBGROUP_TO_VEC // Shuffle XOR -// int __spirv_SubgroupShuffleXorINTEL(int, unsigned int) -_CLC_DEF int -_Z31__spirv_SubgroupShuffleXorINTELIiET_S0_j(int Data, - unsigned int InvocationId) { +_CLC_OVERLOAD _CLC_DEF int +__spirv_SubgroupShuffleXorINTEL(int Data, unsigned int InvocationId) { int self = SELF; unsigned int index = self ^ InvocationId; index = @@ -160,27 +144,20 @@ _Z31__spirv_SubgroupShuffleXorINTELIiET_S0_j(int Data, } // Sub 32-bit types. 
-// _Z31__spirv_SubgroupShuffleXorINTELIaET_S0_j - char -// _Z31__spirv_SubgroupShuffleXorINTELIhET_S0_j - unsigned char -// _Z31__spirv_SubgroupShuffleXorINTELIsET_S0_j - short -// _Z31__spirv_SubgroupShuffleXorINTELItET_S0_j - unsigned short -// _Z31__spirv_SubgroupShuffleXorINTELIDF16_ET_S0_j - half -#define __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE \ - _Z31__spirv_SubgroupShuffleXorINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ - TYPE Data, unsigned int InvocationId) { \ - return _Z31__spirv_SubgroupShuffleXorINTELIiET_S0_j(Data, InvocationId); \ +#define __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE \ + __spirv_SubgroupShuffleXorINTEL(TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleXorINTEL((int)Data, InvocationId); \ } -__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(char, a); -__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(unsigned char, h); -__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(short, s); -__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(unsigned short, t); +__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(char); +__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(unsigned char); +__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(short); +__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(unsigned short); #ifdef cl_khr_fp16 -_CLC_DEF half _Z31__spirv_SubgroupShuffleXorINTELIDF16_ET_S0_j( +_CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleXorINTEL( half Data, unsigned int InvocationId) { unsigned short tmp = as_ushort(Data); - tmp = (unsigned short)_Z31__spirv_SubgroupShuffleXorINTELIiET_S0_j( - (int)tmp, InvocationId); + tmp = (unsigned short)__spirv_SubgroupShuffleXorINTEL(tmp, InvocationId); return as_half(tmp); } #endif // cl_khr_fp16 @@ -189,111 +166,99 @@ _CLC_DEF half _Z31__spirv_SubgroupShuffleXorINTELIDF16_ET_S0_j( // 32-bit types. // __spirv_SubgroupShuffleXorINTEL - unsigned int // __spirv_SubgroupShuffleXorINTEL - float -#define __AMDGCN_CLC_SUBGROUP_XOR_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE \ - _Z31__spirv_SubgroupShuffleXorINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ - TYPE Data, unsigned int InvocationId) { \ - return __builtin_astype(_Z31__spirv_SubgroupShuffleXorINTELIiET_S0_j( \ - as_int(Data), InvocationId), \ - CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_XOR_I32(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE \ + __spirv_SubgroupShuffleXorINTEL(TYPE Data, unsigned int InvocationId) { \ + return __builtin_astype(__spirv_SubgroupShuffleXorINTEL( \ + as_int(Data), InvocationId), \ + CAST_TYPE); \ } -__AMDGCN_CLC_SUBGROUP_XOR_I32(unsigned int, uint, j); -__AMDGCN_CLC_SUBGROUP_XOR_I32(float, float, f); +__AMDGCN_CLC_SUBGROUP_XOR_I32(unsigned int, uint); +__AMDGCN_CLC_SUBGROUP_XOR_I32(float, float); #undef __AMDGCN_CLC_SUBGROUP_XOR_I32 // 64-bit types.
-// __spirv_SubgroupShuffleXorINTEL - long -// __spirv_SubgroupShuffleXorINTEL - unsigned long -// __spirv_SubgroupShuffleXorINTEL - double -#define __AMDGCN_CLC_SUBGROUP_XOR_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE \ - _Z31__spirv_SubgroupShuffleXorINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ - TYPE Data, unsigned int InvocationId) { \ - int2 tmp = as_int2(Data); \ - tmp.lo = \ - _Z31__spirv_SubgroupShuffleXorINTELIiET_S0_j(tmp.lo, InvocationId); \ - tmp.hi = \ - _Z31__spirv_SubgroupShuffleXorINTELIiET_S0_j(tmp.hi, InvocationId); \ - return __builtin_astype(tmp, CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_XOR_I64(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE \ + __spirv_SubgroupShuffleXorINTEL(TYPE Data, unsigned int InvocationId) { \ + int2 tmp = as_int2(Data); \ + tmp.lo = __spirv_SubgroupShuffleXorINTEL(tmp.lo, InvocationId); \ + tmp.hi = __spirv_SubgroupShuffleXorINTEL(tmp.hi, InvocationId); \ + return __builtin_astype(tmp, CAST_TYPE); \ } -__AMDGCN_CLC_SUBGROUP_XOR_I64(long, long, l); -__AMDGCN_CLC_SUBGROUP_XOR_I64(unsigned long, ulong, m); -__AMDGCN_CLC_SUBGROUP_XOR_I64(double, double, d); +__AMDGCN_CLC_SUBGROUP_XOR_I64(long, long); +__AMDGCN_CLC_SUBGROUP_XOR_I64(unsigned long, ulong); +__AMDGCN_CLC_SUBGROUP_XOR_I64(double, double); #undef __AMDGCN_CLC_SUBGROUP_XOR_I64 // Vector types. -#define __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ - _CLC_DEF TYPE \ - _Z31__spirv_SubgroupShuffleXorINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_j( \ - TYPE Data, unsigned int InvocationId) { \ - TYPE res; \ - for (int i = 0; i < NUM_ELEMS; ++i) { \ - res[i] = \ - _Z31__spirv_SubgroupShuffleXorINTELI##MANGLED_SCALAR_TY##ET_S0_j( \ - Data[i], InvocationId); \ - } \ - return res; \ +#define __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(TYPE, NUM_ELEMS) \ + _CLC_OVERLOAD _CLC_DEF TYPE \ + __spirv_SubgroupShuffleXorINTEL(TYPE Data, unsigned int InvocationId) { \ + TYPE res; \ + for (int i = 0; i < NUM_ELEMS; ++i) { \ + res[i] = __spirv_SubgroupShuffleXorINTEL(Data[i], InvocationId); \ + } \ + return res; \ } // [u]char -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char2, a, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char4, a, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char8, a, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char16, a, 16) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar2, h, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar4, h, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar8, h, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar16, h, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char16, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar16, 16) // [u]short -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short2, s, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short4, s, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short8, s, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short16, s, 16) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort2, t, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort4, t, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort8, t, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort16, t, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short16, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort8, 8) 
+__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort16, 16) // [u]int -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int2, i, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int4, i, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int8, i, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int16, i, 16) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint2, j, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint4, j, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint8, j, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint16, j, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int16, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint16, 16) // [u]long -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long2, l, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long4, l, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long8, l, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long16, l, 16) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong2, m, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong4, m, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong8, m, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong16, m, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long16, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong16, 16) // float -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float2, f, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float4, f, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float8, f, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float16, f, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float16, 16) // half #ifdef cl_khr_fp16 -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half2, DF16_, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half4, DF16_, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half8, DF16_, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half16, DF16_, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half16, 16) #endif // cl_khr_fp16 // double -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double2, d, 2) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double4, d, 4) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double8, d, 8) -__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double16, d, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double2, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double4, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double8, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double16, 16) #undef __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC // Shuffle Up -// int __spirv_SubgroupShuffleUpINTEL(int, int, unsigned int) -_CLC_DEF int -_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int previous, int current, - unsigned int delta) { +_CLC_OVERLOAD _CLC_DEF int +__spirv_SubgroupShuffleUpINTEL(int previous, int current, unsigned int delta) { int self = SELF; int size = SUBGROUP_SIZE; @@ -315,145 +280,125 @@ _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int previous, int current, } // Sub 32-bit types. 
-// _Z30__spirv_SubgroupShuffleUpINTELIaET_S0_S0_j - char -// _Z30__spirv_SubgroupShuffleUpINTELIhET_S0_S0_j - unsigned char -// _Z30__spirv_SubgroupShuffleUpINTELIsET_S0_S0_j - short -// _Z30__spirv_SubgroupShuffleUpINTELItET_S0_S0_j - unsigned short -// _Z30__spirv_SubgroupShuffleUpINTELIDF16_ET_S0_S0_j - half -#define __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE \ - _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE previous, TYPE current, unsigned int delta) { \ - return _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(previous, current, \ - delta); \ +#define __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ + TYPE previous, TYPE current, unsigned int delta) { \ + return __spirv_SubgroupShuffleUpINTEL((int)previous, (int)current, delta); \ } -__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(char, a); -__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned char, h); -__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(short, s); -__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned short, t); +__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(char); +__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned char); +__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(short); +__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned short); // half #ifdef cl_khr_fp16 -_CLC_DEF half _Z30__spirv_SubgroupShuffleUpINTELIDF16_ET_S0_S0_j( +_CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleUpINTEL( half previous, half current, unsigned int delta) { unsigned short tmpP = as_ushort(previous); unsigned short tmpC = as_ushort(current); - tmpC = (unsigned short)_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( - (int)tmpP, (int)tmpC, delta); + tmpC = __spirv_SubgroupShuffleUpINTEL(tmpP, tmpC, delta); return as_half(tmpC); } #endif // cl_khr_fp16 #undef __AMDGCN_CLC_SUBGROUP_UP_SUB_I32 // 32-bit types. -// __spirv_SubgroupShuffleUpINTELi - unsigned int -// __spirv_SubgroupShuffleUpINTELi - float -#define __AMDGCN_CLC_SUBGROUP_UP_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE \ - _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE previous, TYPE current, unsigned int delta) { \ - return __builtin_astype(_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ - as_int(previous), as_int(current), delta), \ - CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_UP_I32(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ + TYPE previous, TYPE current, unsigned int delta) { \ + return __builtin_astype(__spirv_SubgroupShuffleUpINTEL( \ + as_int(previous), as_int(current), delta), \ + CAST_TYPE); \ } -__AMDGCN_CLC_SUBGROUP_UP_I32(unsigned int, uint, j); -__AMDGCN_CLC_SUBGROUP_UP_I32(float, float, f); +__AMDGCN_CLC_SUBGROUP_UP_I32(unsigned int, uint); +__AMDGCN_CLC_SUBGROUP_UP_I32(float, float); #undef __AMDGCN_CLC_SUBGROUP_UP_I32 // 64-bit types. 
-// __spirv_SubgroupShuffleUpINTEL - long -// __spirv_SubgroupShuffleUpINTEL - unsigned long -// __spirv_SubgroupShuffleUpINTEL - double -#define __AMDGCN_CLC_SUBGROUP_UP_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE \ - _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE previous, TYPE current, unsigned int delta) { \ - int2 tmp_previous = as_int2(previous); \ - int2 tmp_current = as_int2(current); \ - int2 ret; \ - ret.lo = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ - tmp_previous.lo, tmp_current.lo, delta); \ - ret.hi = _Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( \ - tmp_previous.hi, tmp_current.hi, delta); \ - return __builtin_astype(ret, CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_UP_I64(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ + TYPE previous, TYPE current, unsigned int delta) { \ + int2 tmp_previous = as_int2(previous); \ + int2 tmp_current = as_int2(current); \ + int2 ret; \ + ret.lo = __spirv_SubgroupShuffleUpINTEL( \ + tmp_previous.lo, tmp_current.lo, delta); \ + ret.hi = __spirv_SubgroupShuffleUpINTEL( \ + tmp_previous.hi, tmp_current.hi, delta); \ + return __builtin_astype(ret, CAST_TYPE); \ } -__AMDGCN_CLC_SUBGROUP_UP_I64(long, long, l); -__AMDGCN_CLC_SUBGROUP_UP_I64(unsigned long, ulong, m); -__AMDGCN_CLC_SUBGROUP_UP_I64(double, double, d); +__AMDGCN_CLC_SUBGROUP_UP_I64(long, long); +__AMDGCN_CLC_SUBGROUP_UP_I64(unsigned long, ulong); +__AMDGCN_CLC_SUBGROUP_UP_I64(double, double); #undef __AMDGCN_CLC_SUBGROUP_UP_I64 // Vector types. -#define __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ - _CLC_DEF TYPE \ - _Z30__spirv_SubgroupShuffleUpINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \ - TYPE previous, TYPE current, unsigned int delta) { \ - TYPE res; \ - for (int i = 0; i < NUM_ELEMS; ++i) { \ - res[i] = \ - _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_SCALAR_TY##ET_S0_S0_j( \ - previous[i], current[i], delta); \ - } \ - return res; \ +#define __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(TYPE, NUM_ELEMS) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ + TYPE previous, TYPE current, unsigned int delta) { \ + TYPE res; \ + for (int i = 0; i < NUM_ELEMS; ++i) { \ + res[i] = __spirv_SubgroupShuffleUpINTEL(previous[i], current[i], delta); \ + } \ + return res; \ } // [u]char -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char2, a, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char4, a, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char8, a, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char16, a, 16) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar2, h, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar4, h, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar8, h, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar16, h, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char16, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar16, 16) // [u]short -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short2, s, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short4, s, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short8, s, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short16, s, 16) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort2, t, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort4, t, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort8, t, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort16, t, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short2, 2) 
+__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short16, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort16, 16) // [u]int -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int2, i, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int4, i, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int8, i, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int16, i, 16) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint2, j, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint4, j, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint8, j, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint16, j, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int16, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint16, 16) // [u]long -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long2, l, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long4, l, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long8, l, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long16, l, 16) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong2, m, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong4, m, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong8, m, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong16, m, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long16, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong16, 16) // half #ifdef cl_khr_fp16 -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half2, DF16_, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half4, DF16_, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half8, DF16_, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half16, DF16_, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half16, 16) #endif // cl_khr_fp16 // float -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float2, f, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float4, f, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float8, f, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float16, f, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float16, 16) // double -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double2, d, 2) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double4, d, 4) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double8, d, 8) -__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double16, d, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double2, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double4, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double8, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double16, 16) #undef __AMDGCN_CLC_SUBGROUP_UP_TO_VEC // Shuffle Down -// int __spirv_SubgroupShuffleDownINTEL(int, int, unsigned int) -_CLC_DEF int -_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int current, int next, - unsigned int delta) { +_CLC_OVERLOAD _CLC_DEF int +__spirv_SubgroupShuffleDownINTEL(int current, int next, unsigned int delta) { int self = SELF; int size = SUBGROUP_SIZE; @@ -475,30 +420,22 @@ _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int current, int next, } // Sub 32-bit types. 
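// Every narrower shuffle-down overload below funnels into the int
// primitive defined above, whose body (elided by the hunk) computes the
// source lane from `self` (SELF) and `size` (SUBGROUP_SIZE). A
// hypothetical lane-level reference model of its semantics follows;
// ref_shuffle_down, cur, and nxt are illustrative names only, not part
// of this patch: lane `self` reads lane (self + delta)'s `current` while
// that index stays inside the subgroup, and wraps into `next` otherwise.
int ref_shuffle_down(const int *cur, const int *nxt, unsigned int delta,
                     int self, int size) {
  int idx = self + (int)delta;
  // Reads past `next` (idx - size >= size) are undefined per the extension.
  return idx < size ? cur[idx] : nxt[idx - size];
}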
-// _Z32__spirv_SubgroupShuffleDownINTELIaET_S0_S0_j - char -// _Z32__spirv_SubgroupShuffleDownINTELIhET_S0_S0_j - unsigned char -// _Z32__spirv_SubgroupShuffleDownINTELIsET_S0_S0_j - short -// _Z32__spirv_SubgroupShuffleDownINTELItET_S0_S0_j - unsigned short -// _Z32__spirv_SubgroupShuffleDownINTELIDF16_ET_S0_S0_j - half -#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE \ - _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE current, TYPE next, unsigned int delta) { \ - return _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(current, next, \ - delta); \ +#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ + TYPE current, TYPE next, unsigned int delta) { \ + return __spirv_SubgroupShuffleDownINTEL((int)current, (int)next, delta); \ } -__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(char, a); -__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned char, h); -__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(short, s); -__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned short, t); +__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(char); +__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned char); +__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(short); +__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned short); // half #ifdef cl_khr_fp16 -_CLC_DEF half _Z32__spirv_SubgroupShuffleDownINTELIDF16_ET_S0_S0_j( +_CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleDownINTEL( half current, half next, unsigned int delta) { unsigned short tmpC = as_ushort(current); unsigned short tmpN = as_ushort(next); - tmpC = (unsigned short)_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( - (int)tmpC, (int)tmpN, delta); + tmpC = __spirv_SubgroupShuffleDownINTEL(tmpC, tmpN, delta); return as_half(tmpC); } #endif // cl_khr_fp16 @@ -507,102 +444,96 @@ _CLC_DEF half _Z32__spirv_SubgroupShuffleDownINTELIDF16_ET_S0_S0_j( // 32-bit types. // __spirv_SubgroupShuffleDownINTEL - unsigned int // __spirv_SubgroupShuffleDownINTEL - float -#define __AMDGCN_CLC_SUBGROUP_DOWN_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE \ - _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ - TYPE current, TYPE next, unsigned int delta) { \ - return __builtin_astype(_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ - as_int(current), as_int(next), delta), \ - CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_DOWN_I32(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ + TYPE current, TYPE next, unsigned int delta) { \ + return __builtin_astype(__spirv_SubgroupShuffleDownINTEL( \ + as_int(current), as_int(next), delta), \ + CAST_TYPE); \ } -__AMDGCN_CLC_SUBGROUP_DOWN_I32(unsigned int, uint, j); -__AMDGCN_CLC_SUBGROUP_DOWN_I32(float, float, f); +__AMDGCN_CLC_SUBGROUP_DOWN_I32(unsigned int, uint); +__AMDGCN_CLC_SUBGROUP_DOWN_I32(float, float); #undef __AMDGCN_CLC_SUBGROUP_DOWN_I32 // 64-bit types. 
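// As in the up-shuffle case, 64-bit values are split into int2 halves and
// each half is shuffled through the 32-bit primitive. Note that as_int2
// and __builtin_astype reinterpret bits rather than convert values, which
// is what keeps this bit-exact for double as well as for [u]long. For
// reference, __AMDGCN_CLC_SUBGROUP_DOWN_I64(long, long) expands roughly to:
_CLC_OVERLOAD _CLC_DEF long __spirv_SubgroupShuffleDownINTEL(
    long current, long next, unsigned int delta) {
  int2 tmp_current = as_int2(current);
  int2 tmp_next = as_int2(next);
  int2 ret;
  ret.lo = __spirv_SubgroupShuffleDownINTEL(tmp_current.lo, tmp_next.lo, delta);
  ret.hi = __spirv_SubgroupShuffleDownINTEL(tmp_current.hi, tmp_next.hi, delta);
  return __builtin_astype(ret, long);
}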
-// double __spirv_SubgroupShuffleDownINTEL(double, unsigned int, int) -#define __AMDGCN_CLC_SUBGROUP_DOWN_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ - _CLC_DEF TYPE \ - _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ +#define __AMDGCN_CLC_SUBGROUP_DOWN_I64(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ TYPE current, TYPE next, unsigned int delta) { \ int2 tmp_current = as_int2(current); \ int2 tmp_next = as_int2(next); \ int2 ret; \ - ret.lo = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ + ret.lo = __spirv_SubgroupShuffleDownINTEL( \ tmp_current.lo, tmp_next.lo, delta); \ - ret.hi = _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j( \ + ret.hi = __spirv_SubgroupShuffleDownINTEL( \ tmp_current.hi, tmp_next.hi, delta); \ return __builtin_astype(ret, CAST_TYPE); \ } -__AMDGCN_CLC_SUBGROUP_DOWN_I64(long, long, l); -__AMDGCN_CLC_SUBGROUP_DOWN_I64(unsigned long, ulong, m); -__AMDGCN_CLC_SUBGROUP_DOWN_I64(double, double, d); +__AMDGCN_CLC_SUBGROUP_DOWN_I64(long, long); +__AMDGCN_CLC_SUBGROUP_DOWN_I64(unsigned long, ulong); +__AMDGCN_CLC_SUBGROUP_DOWN_I64(double, double); #undef __AMDGCN_CLC_SUBGROUP_DOWN_I64 // Vector types. -#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ - _CLC_DEF TYPE \ - _Z32__spirv_SubgroupShuffleDownINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \ - TYPE current, TYPE next, unsigned int delta) { \ - TYPE res; \ - for (int i = 0; i < NUM_ELEMS; ++i) { \ - res[i] = \ - _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_SCALAR_TY##ET_S0_S0_j( \ - current[i], next[i], delta); \ - } \ - return res; \ +#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(TYPE, NUM_ELEMS) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ + TYPE current, TYPE next, unsigned int delta) { \ + TYPE res; \ + for (int i = 0; i < NUM_ELEMS; ++i) { \ + res[i] = __spirv_SubgroupShuffleDownINTEL(current[i], next[i], delta); \ + } \ + return res; \ } // [u]char -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char2, a, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char4, a, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char8, a, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char16, a, 16) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar2, h, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar4, h, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar8, h, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar16, h, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char16, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar16, 16) // [u]short -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short2, s, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short4, s, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short8, s, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short16, s, 16) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort2, t, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort4, t, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort8, t, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort16, t, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short16, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort16, 16) // [u]int 
-__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int2, i, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int4, i, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int8, i, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int16, i, 16) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint2, j, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint4, j, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint8, j, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint16, j, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int16, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint16, 16) // [u]long -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long2, l, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long4, l, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long8, l, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long16, l, 16) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong2, m, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong4, m, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong8, m, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong16, m, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long16, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong16, 16) // half #ifdef cl_khr_fp16 -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half2, DF16_, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half4, DF16_, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half8, DF16_, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half16, DF16_, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half16, 16) #endif // cl_khr_fp16 // float -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float2, f, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float4, f, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float8, f, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float16, f, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float16, 16) // double -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double2, d, 2) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double4, d, 4) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double8, d, 8) -__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double16, d, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double2, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double4, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double8, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double16, 16) #undef __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC diff --git a/libdevice/nativecpu_utils.cpp b/libdevice/nativecpu_utils.cpp index ca20725a46917..5c374b5003155 100644 --- a/libdevice/nativecpu_utils.cpp +++ b/libdevice/nativecpu_utils.cpp @@ -73,10 +73,8 @@ DefGenericCastToPtrExpl(ToGlobal, OCL_GLOBAL); __spirv_SubgroupBlockReadINTEL(const OCL_GLOBAL PType *Ptr) noexcept { \ return Ptr[__spirv_SubgroupLocalInvocationId()]; \ } \ - template <> \ - __SYCL_CONVERGENT__ DEVICE_EXTERNAL void \ - __spirv_SubgroupBlockWriteINTEL(PType OCL_GLOBAL * ptr, \ - Type v) noexcept { \ + __SYCL_CONVERGENT__ DEVICE_EXTERNAL void __spirv_SubgroupBlockWriteINTEL( \ + PType OCL_GLOBAL *ptr, Type v) noexcept { \ ((Type *)ptr)[__spirv_SubgroupLocalInvocationId()] = v; \ } \ static_assert(true) @@ -252,9 +250,8 @@ 
DefineBroadCast(int64_t, i64, int64_t); #define DefShuffleINTEL(Type, Sfx, MuxType) \ DEVICE_EXTERN_C MuxType __mux_sub_group_shuffle_##Sfx(MuxType val, \ int32_t lid) noexcept; \ - template <> \ - DEVICE_EXTERNAL Type __spirv_SubgroupShuffleINTEL( \ - Type val, unsigned id) noexcept { \ + DEVICE_EXTERNAL Type __spirv_SubgroupShuffleINTEL(Type val, \ + unsigned id) noexcept { \ return (Type)__mux_sub_group_shuffle_##Sfx((MuxType)val, id); \ } \ static_assert(true) @@ -262,8 +259,7 @@ DefineBroadCast(int64_t, i64, int64_t); #define DefShuffleUpINTEL(Type, Sfx, MuxType) \ DEVICE_EXTERN_C MuxType __mux_sub_group_shuffle_up_##Sfx( \ MuxType prev, MuxType curr, int32_t delta) noexcept; \ - template <> \ - DEVICE_EXTERNAL Type __spirv_SubgroupShuffleUpINTEL( \ + DEVICE_EXTERNAL Type __spirv_SubgroupShuffleUpINTEL( \ Type prev, Type curr, unsigned delta) noexcept { \ return (Type)__mux_sub_group_shuffle_up_##Sfx((MuxType)prev, \ (MuxType)curr, delta); \ @@ -273,8 +269,7 @@ DefineBroadCast(int64_t, i64, int64_t); #define DefShuffleDownINTEL(Type, Sfx, MuxType) \ DEVICE_EXTERN_C MuxType __mux_sub_group_shuffle_down_##Sfx( \ MuxType curr, MuxType next, int32_t delta) noexcept; \ - template <> \ - DEVICE_EXTERNAL Type __spirv_SubgroupShuffleDownINTEL( \ + DEVICE_EXTERNAL Type __spirv_SubgroupShuffleDownINTEL( \ Type curr, Type next, unsigned delta) noexcept { \ return (Type)__mux_sub_group_shuffle_down_##Sfx((MuxType)curr, \ (MuxType)next, delta); \ @@ -284,8 +279,7 @@ DefineBroadCast(int64_t, i64, int64_t); #define DefShuffleXorINTEL(Type, Sfx, MuxType) \ DEVICE_EXTERN_C MuxType __mux_sub_group_shuffle_xor_##Sfx(MuxType val, \ int32_t xor_val); \ - template <> \ - DEVICE_EXTERNAL Type __spirv_SubgroupShuffleXorINTEL( \ + DEVICE_EXTERNAL Type __spirv_SubgroupShuffleXorINTEL( \ Type data, unsigned value) noexcept { \ return (Type)__mux_sub_group_shuffle_xor_##Sfx((MuxType)data, value); \ } \ diff --git a/llvm/lib/SYCLNativeCPUUtils/FixABIMuxBuiltinsSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/FixABIMuxBuiltinsSYCLNativeCPU.cpp index b3ff7372b8d0f..efe2bef1b9ecd 100644 --- a/llvm/lib/SYCLNativeCPUUtils/FixABIMuxBuiltinsSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/FixABIMuxBuiltinsSYCLNativeCPU.cpp @@ -37,25 +37,25 @@ PreservedAnalyses FixABIMuxBuiltinsPass::run(Module &M, return false; } Updates.clear(); - auto LIDvPos = F.getName().find("ELIDv"); + auto LIDvPos = F.getName().find("ELDv"); llvm::StringRef NameToMatch; if (LIDvPos != llvm::StringRef::npos) { - // Add sizeof ELIDv to get num characters to match against - NameToMatch = F.getName().take_front(LIDvPos + 5); + // Add length of ELDv to get num characters to match against + NameToMatch = F.getName().take_front(LIDvPos + 4); } else { return false; } unsigned int StartIdx = 0; unsigned int EndIdx = 1; - if (NameToMatch == "_Z32__spirv_SubgroupShuffleDownINTELIDv") { + if (NameToMatch == "_Z32__spirv_SubgroupShuffleDownINTELDv") { MuxFuncNameToCall = "__mux_sub_group_shuffle_down_"; - } else if (NameToMatch == "_Z30__spirv_SubgroupShuffleUpINTELIDv") { + } else if (NameToMatch == "_Z30__spirv_SubgroupShuffleUpINTELDv") { MuxFuncNameToCall = "__mux_sub_group_shuffle_up_"; - } else if (NameToMatch == "_Z28__spirv_SubgroupShuffleINTELIDv") { + } else if (NameToMatch == "_Z28__spirv_SubgroupShuffleINTELDv") { MuxFuncNameToCall = "__mux_sub_group_shuffle_"; EndIdx = 0; - } else if (NameToMatch == "_Z31__spirv_SubgroupShuffleXorINTELIDv") { + } else if (NameToMatch == "_Z31__spirv_SubgroupShuffleXorINTELDv") { MuxFuncNameToCall = "__mux_sub_group_shuffle_xor_"; EndIdx = 0; } else { diff --git
a/sycl/include/sycl/__spirv/spirv_ops.hpp b/sycl/include/sycl/__spirv/spirv_ops.hpp index 4003b02485b87..4d99badf8d475 100644 --- a/sycl/include/sycl/__spirv/spirv_ops.hpp +++ b/sycl/include/sycl/__spirv/spirv_ops.hpp @@ -395,101 +395,46 @@ __SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Max) #undef __SPIRV_ATOMIC_UNSIGNED #undef __SPIRV_ATOMIC_XOR -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT -__spirv_SubgroupShuffleINTEL(dataT Data, uint32_t InvocationId) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT -__spirv_SubgroupShuffleDownINTEL(dataT Current, dataT Next, - uint32_t Delta) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT -__spirv_SubgroupShuffleUpINTEL(dataT Previous, dataT Current, - uint32_t Delta) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT -__spirv_SubgroupShuffleXorINTEL(dataT Data, uint32_t Value) noexcept; - template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT __spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_global)) uint8_t *Ptr) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void -__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_global)) uint8_t *Ptr, - dataT Data) noexcept; - template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT __spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_global)) uint16_t *Ptr) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void -__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_global)) uint16_t *Ptr, - dataT Data) noexcept; - template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT __spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_global)) uint32_t *Ptr) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void -__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_global)) uint32_t *Ptr, - dataT Data) noexcept; - template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT __spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_global)) uint64_t *Ptr) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void -__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_global)) uint64_t *Ptr, - dataT Data) noexcept; - template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT __spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local)) uint8_t *Ptr) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void -__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint8_t *Ptr, - dataT Data) noexcept; - template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT __spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local)) uint16_t *Ptr) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void -__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint16_t *Ptr, - dataT Data) noexcept; - template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT __spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local)) uint32_t *Ptr) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void -__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint32_t *Ptr, - dataT Data) noexcept; - template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT __spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local)) uint64_t *Ptr) noexcept; -template -__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void -__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint64_t *Ptr, - dataT Data) noexcept; - template extern 
__DPCPP_SYCL_EXTERNAL sycl::detail::ap_int __spirv_FixedSqrtINTEL(sycl::detail::ap_int a, bool S, int32_t I, int32_t rI, diff --git a/sycl/test/check_device_code/group_load_store_alignment.cpp b/sycl/test/check_device_code/group_load_store_alignment.cpp index b651fb3b8f802..1de28486460d4 100644 --- a/sycl/test/check_device_code/group_load_store_alignment.cpp +++ b/sycl/test/check_device_code/group_load_store_alignment.cpp @@ -82,7 +82,7 @@ SYCL_EXTERNAL void test_load_with_alignment_hint(sycl::sub_group &sg, // CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT:%.*]] // CHECK-GLOBAL: if.end.i.i: -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jj(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT]] // CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_RKSN_SO_SQ_.exit: // CHECK-GLOBAL-NEXT: ret void @@ -96,7 +96,7 @@ SYCL_EXTERNAL void test_store_without_alignment_hint(sycl::sub_group &sg, int v, // CHECK-GLOBAL-NEXT: entry: // CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null // CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V:%.*]]) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jj(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V:%.*]]) #[[ATTR3]] // CHECK-GLOBAL-NEXT: ret void // SYCL_EXTERNAL void test_store_with_alignment_hint(sycl::sub_group &sg, int v, diff --git a/sycl/test/check_device_code/group_load_store_native_key.cpp b/sycl/test/check_device_code/group_load_store_native_key.cpp index f48601ab8c215..a30b89616cd1a 100644 --- a/sycl/test/check_device_code/group_load_store_native_key.cpp +++ b/sycl/test/check_device_code/group_load_store_native_key.cpp @@ -109,7 +109,7 @@ SYCL_EXTERNAL void test_load_native(sycl::sub_group &sg, plain_ptr p, // CHECK-GLOBAL-NEXT: 
tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT:%.*]] // CHECK-GLOBAL: if.end.i.i: -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jj(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT]] // CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_RKSN_SO_SQ_.exit: // CHECK-GLOBAL-NEXT: ret void @@ -147,7 +147,7 @@ SYCL_EXTERNAL void test_store(sycl::sub_group &sg, int v, plain_ptr p) { // CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT:%.*]] // CHECK-GLOBAL: if.end.i.i: -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jj(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT]] // CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_RKSP_SQ_SS_.exit: // CHECK-GLOBAL-NEXT: ret void @@ -169,7 +169,7 @@ SYCL_EXTERNAL void test_store(sycl::sub_group &sg, int v, plain_ptr p) { // CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT:%.*]] // CHECK-LOCAL: if.end.i.i: -// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3jj(ptr addrspace(3) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT]] // CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_RKSP_SQ_SS_.exit: // CHECK-LOCAL-NEXT: ret void diff --git a/sycl/test/check_device_code/group_shuffle.cpp b/sycl/test/check_device_code/group_shuffle.cpp index 9bd01bf1e2b9f..fbd875b7a0694 100644 --- a/sycl/test/check_device_code/group_shuffle.cpp +++ b/sycl/test/check_device_code/group_shuffle.cpp @@ -27,7 +27,7 @@ using namespace sycl::ext::oneapi; // CHECK-NEXT: [[CONV_I_I_I:%.*]] = zext nneg i32 [[S_0_I_I]] to i64 // CHECK-NEXT: [[ARRAYIDX_I_I_I_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr [[AGG_TMP14_I]], i64 0, i64 [[CONV_I_I_I]] // CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_I_I_I_I_I]], align 2, !tbaa [[TBAA14:![0-9]+]], !noalias [[META18:![0-9]+]] -// CHECK-NEXT: [[CALL4_I_I_I_I:%.*]] = tail call spir_func noundef zeroext i16 @_Z28__spirv_SubgroupShuffleINTELItET_S0_j(i16 noundef zeroext [[TMP1]], i32 noundef 1) #[[ATTR6:[0-9]+]], !noalias [[META19:![0-9]+]] +// CHECK-NEXT: [[CALL4_I_I_I_I:%.*]] = tail call spir_func noundef zeroext 
i16 @_Z28__spirv_SubgroupShuffleINTELtj(i16 noundef zeroext [[TMP1]], i32 noundef 1) #[[ATTR6:[0-9]+]], !noalias [[META19:![0-9]+]] // CHECK-NEXT: [[ARRAYIDX_I_I_I12_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr [[REF_TMP]], i64 0, i64 [[CONV_I_I_I]] // CHECK-NEXT: store i16 [[CALL4_I_I_I_I]], ptr [[ARRAYIDX_I_I_I12_I_I]], align 2, !tbaa [[TBAA14]], !alias.scope [[META18]] // CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[S_0_I_I]], 1 @@ -72,7 +72,7 @@ SYCL_EXTERNAL void test_shuffle1(sycl::sub_group &sg, vec *buf, // CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[S_0_I_I]] to i64 // CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr [[AGG_TMP14_I]], i64 0, i64 [[CONV_I_I]] // CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA14]], !noalias [[META32]] -// CHECK-NEXT: [[CALL4_I_I_I_I:%.*]] = tail call spir_func noundef zeroext i16 @_Z28__spirv_SubgroupShuffleINTELItET_S0_j(i16 noundef zeroext [[TMP1]], i32 noundef 1) #[[ATTR6]], !noalias [[META33:![0-9]+]] +// CHECK-NEXT: [[CALL4_I_I_I_I:%.*]] = tail call spir_func noundef zeroext i16 @_Z28__spirv_SubgroupShuffleINTELtj(i16 noundef zeroext [[TMP1]], i32 noundef 1) #[[ATTR6]], !noalias [[META33:![0-9]+]] // CHECK-NEXT: [[ARRAYIDX_I13_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr [[REF_TMP]], i64 0, i64 [[CONV_I_I]] // CHECK-NEXT: store i16 [[CALL4_I_I_I_I]], ptr [[ARRAYIDX_I13_I_I]], align 2, !tbaa [[TBAA14]], !alias.scope [[META32]] // CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[S_0_I_I]], 1 @@ -116,7 +116,7 @@ SYCL_EXTERNAL void test_shuffle2(sycl::sub_group &sg, marray *buf, // CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[S_0_I_I]] to i64 // CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds [5 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr [[AGG_TMP14_I]], i64 0, i64 [[CONV_I_I]] // CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA14]], !noalias [[META44]] -// CHECK-NEXT: [[CALL4_I_I_I_I:%.*]] = tail call spir_func noundef zeroext i16 @_Z28__spirv_SubgroupShuffleINTELItET_S0_j(i16 noundef zeroext [[TMP0]], i32 noundef 1) #[[ATTR6]], !noalias [[META45:![0-9]+]] +// CHECK-NEXT: [[CALL4_I_I_I_I:%.*]] = tail call spir_func noundef zeroext i16 @_Z28__spirv_SubgroupShuffleINTELtj(i16 noundef zeroext [[TMP0]], i32 noundef 1) #[[ATTR6]], !noalias [[META45:![0-9]+]] // CHECK-NEXT: [[ARRAYIDX_I13_I_I:%.*]] = getelementptr inbounds [5 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr [[REF_TMP]], i64 0, i64 [[CONV_I_I]] // CHECK-NEXT: store i16 [[CALL4_I_I_I_I]], ptr [[ARRAYIDX_I13_I_I]], align 2, !tbaa [[TBAA14]], !alias.scope [[META44]] // CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[S_0_I_I]], 1 @@ -138,7 +138,7 @@ SYCL_EXTERNAL void test_shuffle3(sycl::sub_group &sg, marray *buf, // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr addrspace(4), ptr addrspace(4) [[BUF:%.*]], i64 [[ID:%.*]] // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ARRAYIDX]], align 8, !tbaa [[TBAA51:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(4) [[TMP0]] to i64 -// CHECK-NEXT: [[CALL4_I_I_I:%.*]] = tail call spir_func noundef i64 @_Z28__spirv_SubgroupShuffleINTELImET_S0_j(i64 noundef [[TMP1]], i32 noundef 1) #[[ATTR6]] +// CHECK-NEXT: [[CALL4_I_I_I:%.*]] = tail call spir_func noundef i64 @_Z28__spirv_SubgroupShuffleINTELmj(i64 noundef [[TMP1]], i32 noundef 1) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = 
inttoptr i64 [[CALL4_I_I_I]] to ptr addrspace(4) // CHECK-NEXT: store ptr addrspace(4) [[TMP2]], ptr addrspace(4) [[ARRAYIDX]], align 8, !tbaa [[TBAA51]] // CHECK-NEXT: ret void @@ -153,7 +153,7 @@ SYCL_EXTERNAL void test_shuffle4(sycl::sub_group &sg, int **buf, size_t id) { // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr addrspace(4), ptr addrspace(4) [[BUF:%.*]], i64 [[ID:%.*]] // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ARRAYIDX]], align 8, !tbaa [[TBAA51]] // CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(4) [[TMP0]] to i64 -// CHECK-NEXT: [[CALL4_I_I_I:%.*]] = tail call spir_func noundef i64 @_Z28__spirv_SubgroupShuffleINTELImET_S0_j(i64 noundef [[TMP1]], i32 noundef 1) #[[ATTR6]] +// CHECK-NEXT: [[CALL4_I_I_I:%.*]] = tail call spir_func noundef i64 @_Z28__spirv_SubgroupShuffleINTELmj(i64 noundef [[TMP1]], i32 noundef 1) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[CALL4_I_I_I]] to ptr addrspace(4) // CHECK-NEXT: store ptr addrspace(4) [[TMP2]], ptr addrspace(4) [[ARRAYIDX]], align 8, !tbaa [[TBAA51]] // CHECK-NEXT: ret void diff --git a/sycl/test/check_device_code/group_store.cpp b/sycl/test/check_device_code/group_store.cpp index 5a358585e24c0..8a825222339ca 100644 --- a/sycl/test/check_device_code/group_store.cpp +++ b/sycl/test/check_device_code/group_store.cpp @@ -92,7 +92,7 @@ SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, int v, plain_ptr p) { // CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT:%.*]] // CHECK-GLOBAL: if.end.i.i: -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jj(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT]] // CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_RKSP_SQ_SS_.exit: // CHECK-GLOBAL-NEXT: ret void @@ -114,7 +114,7 @@ SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, int v, plain_ptr p) { // 
CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT:%.*]] // CHECK-LOCAL: if.end.i.i: -// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3jj(ptr addrspace(3) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT]] // CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_RKSP_SQ_SS_.exit: // CHECK-LOCAL-NEXT: ret void @@ -142,7 +142,7 @@ SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, int v, // CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT:%.*]] // CHECK-GLOBAL: if.end.i.i: -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jj(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT]] // CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_RKSN_SO_SQ_.exit: // CHECK-GLOBAL-NEXT: ret void @@ -164,7 +164,7 @@ SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, int v, // CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT:%.*]] // CHECK-LOCAL: if.end.i.i: -// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3jj(ptr addrspace(3) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT]] // CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_RKSN_SO_SQ_.exit: // CHECK-LOCAL-NEXT: ret void @@ -306,7 +306,7 @@ SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, // CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-GLOBAL: for.cond.cleanup.i.i: // CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I_I]], align 2, !tbaa [[TBAA34:![0-9]+]] -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1mm(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] // CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-GLOBAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] // CHECK-GLOBAL: for.body.i.i: @@ -362,7 +362,7 @@ SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, // CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-LOCAL: for.cond.cleanup.i.i: // CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I_I]], align 2, !tbaa [[TBAA34:![0-9]+]] -// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS3mT_(ptr addrspace(3) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3mm(ptr addrspace(3) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] // CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] // CHECK-LOCAL: for.body.i.i: @@ -425,7 +425,7 @@ SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, span v, // CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-GLOBAL: for.cond.cleanup.i.i: // CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I_I]], align 2, !tbaa [[TBAA34]] -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1mm(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] // CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_ST__EXIT]] // CHECK-GLOBAL: for.body.i.i: @@ -481,7 +481,7 @@ SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, span v, // CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-LOCAL: for.cond.cleanup.i.i: // CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I_I]], align 2, !tbaa [[TBAA34]] -// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS3mT_(ptr 
addrspace(3) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3mm(ptr addrspace(3) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] // CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_ST__EXIT]] // CHECK-LOCAL: for.body.i.i: @@ -782,7 +782,7 @@ SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, span v, // CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-GLOBAL: for.cond.cleanup.i.i: // CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I_I]], align 4, !tbaa [[TBAA34]] -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jDv2_j(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] // CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] // CHECK-GLOBAL: for.body.i.i: @@ -839,7 +839,7 @@ SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, span v, // CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-LOCAL: for.cond.cleanup.i.i: // CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I_I]], align 4, !tbaa [[TBAA34]] -// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3jDv2_j(ptr addrspace(3) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] // CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] // CHECK-LOCAL: for.body.i.i: @@ -902,7 +902,7 @@ SYCL_EXTERNAL 
void test_optimized(sycl::sub_group &sg, span v, // CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-GLOBAL: for.cond.cleanup.i.i: // CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I_I]], align 4, !tbaa [[TBAA34]] -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1jDv2_j(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] // CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT]] // CHECK-GLOBAL: for.body.i.i: @@ -959,7 +959,7 @@ SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, span v, // CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-LOCAL: for.cond.cleanup.i.i: // CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I_I]], align 4, !tbaa [[TBAA34]] -// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3jDv2_j(ptr addrspace(3) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] // CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT]] // CHECK-LOCAL: for.body.i.i: @@ -1142,7 +1142,7 @@ SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, // CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-GLOBAL: for.cond.cleanup.i.i: // CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[VALUES_I_I]], align 2, !tbaa [[TBAA34]] -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv4_tEvPU3AS1tT_(ptr addrspace(1) noundef nonnull [[P]], <4 x i16> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1tDv4_t(ptr addrspace(1) noundef nonnull [[P]], <4 x i16> noundef [[TMP6]]) #[[ATTR7]] // CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-GLOBAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] // CHECK-GLOBAL: for.body.i.i: @@ -1199,7 +1199,7 @@ SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, // CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-LOCAL: for.cond.cleanup.i.i: // CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[VALUES_I_I]], align 2, !tbaa [[TBAA34]] -// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv4_tEvPU3AS3tT_(ptr addrspace(3) noundef nonnull [[P]], <4 x i16> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3tDv4_t(ptr addrspace(3) noundef nonnull [[P]], <4 x i16> noundef [[TMP6]]) #[[ATTR7]] // CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] // CHECK-LOCAL: for.body.i.i: @@ -1263,7 +1263,7 @@ SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, span v, // CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-GLOBAL: for.cond.cleanup.i.i: // CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr [[VALUES_I_I]], align 2, !tbaa [[TBAA34]] -// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv16_tEvPU3AS1tT_(ptr addrspace(1) noundef nonnull [[P]], <16 x i16> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS1tDv16_t(ptr addrspace(1) noundef nonnull [[P]], <16 x i16> noundef [[TMP6]]) #[[ATTR7]] // CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] // CHECK-GLOBAL: for.body.i.i: @@ -1320,7 +1320,7 @@ SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, span v, // CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[FOR_COND_CLEANUP_I_I:%.*]] // CHECK-LOCAL: for.cond.cleanup.i.i: // CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr [[VALUES_I_I]], align 2, !tbaa [[TBAA34]] -// CHECK-LOCAL-NEXT: tail call spir_func 
void @_Z31__spirv_SubgroupBlockWriteINTELIDv16_tEvPU3AS3tT_(ptr addrspace(3) noundef nonnull [[P]], <16 x i16> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELPU3AS3tDv16_t(ptr addrspace(3) noundef nonnull [[P]], <16 x i16> noundef [[TMP6]]) #[[ATTR7]] // CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VALUES_I_I]]) #[[ATTR9]] // CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] // CHECK-LOCAL: for.body.i.i: diff --git a/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp b/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp index 8a94745f08100..0f187d07639dc 100644 --- a/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp +++ b/sycl/test/check_device_code/native_cpu/shuffle_abi.cpp @@ -18,33 +18,33 @@ static constexpr size_t NumElems = VEC_WIDTH; static constexpr size_t NumWorkItems = 64; -// UP_V2_INT: double @_Z30__spirv_SubgroupShuffleUpINTELIDv2_iET_S1_S1_j(double noundef %[[ARG0:[0-9]+]], double noundef %[[ARG1:[0-9]+]] +// UP_V2_INT: double @_Z30__spirv_SubgroupShuffleUpINTELDv2_iS_j(double noundef %[[ARG0:[0-9]+]], double noundef %[[ARG1:[0-9]+]] // UP_V2_INT: %[[UPV2I32_BITCAST_OP0:[0-9]+]] = bitcast double %[[ARG0]] to <2 x i32> // UP_V2_INT: %[[UPV2I32_BITCAST_OP1:[0-9]+]] = bitcast double %[[ARG1]] to <2 x i32> // UP_V2_INT: %[[UPV2I32_CALL_SHUFFLE:[0-9]+]] = call <2 x i32> @__mux_sub_group_shuffle_up_v2i32(<2 x i32> %[[UPV2I32_BITCAST_OP0]], <2 x i32> %[[UPV2I32_BITCAST_OP1]] // UP_V2_INT: %[[UPV2I32_BITCAST_RESULT:[0-9]+]] = bitcast <2 x i32> %[[UPV2I32_CALL_SHUFFLE]] to double // UP_V2_INT: ret double %[[UPV2I32_BITCAST_RESULT]] -// DOWN_V4_SHORT: double @_Z32__spirv_SubgroupShuffleDownINTELIDv4_sET_S1_S1_j(double noundef %[[ARG0:[0-9]+]], double noundef %[[ARG1:[0-9]+]] +// DOWN_V4_SHORT: double @_Z32__spirv_SubgroupShuffleDownINTELDv4_sS_j(double noundef %[[ARG0:[0-9]+]], double noundef %[[ARG1:[0-9]+]] // DOWN_V4_SHORT: %[[DOWNV4I16_BITCAST_OP0:[0-9]+]] = bitcast double %[[ARG0]] to <4 x i16> // DOWN_V4_SHORT: %[[DOWNV4I16_BITCAST_OP1:[0-9]+]] = bitcast double %[[ARG1]] to <4 x i16> // DOWN_V4_SHORT: %[[DOWNV4I16_CALL_SHUFFLE:[0-9]+]] = call <4 x i16> @__mux_sub_group_shuffle_down_v4i16(<4 x i16> %[[DOWNV4I16_BITCAST_OP0]], <4 x i16> %[[DOWNV4I16_BITCAST_OP1]] // DOWN_V4_SHORT: %[[DOWNV4I16_BITCAST_RESULT:[0-9]+]] = bitcast <4 x i16> %[[DOWNV4I16_CALL_SHUFFLE]] to double // DOWN_V4_SHORT: ret double %[[DOWNV4I16_BITCAST_RESULT]] -// XOR_V4_CHAR: i32 @_Z31__spirv_SubgroupShuffleXorINTELIDv4_aET_S1_j(i32 noundef %[[ARG0:[0-9]+]], i32 +// XOR_V4_CHAR: i32 @_Z31__spirv_SubgroupShuffleXorINTELDv4_aj(i32 noundef %[[ARG0:[0-9]+]], i32 // XOR_V4_CHAR: %[[XORV4I8_BITCAST_OP0:[0-9]+]] = bitcast i32 %[[ARG0]] to <4 x i8> // XOR_V4_CHAR: %[[XORV4I8_CALL_SHUFFLE:[0-9]+]] = call <4 x i8> @__mux_sub_group_shuffle_xor_v4i8(<4 x i8> %[[XORV4I8_BITCAST_OP0]], i32 // XOR_V4_CHAR: %[[XORV4I8_BITCAST_RESULT:[0-9]+]] = bitcast <4 x i8> %[[XORV4I8_CALL_SHUFFLE]] to i32 // XOR_V4_CHAR: ret i32 %[[XORV4I8_BITCAST_RESULT]] -// UP_V8_FLOAT: <8 x 
float> @_Z30__spirv_SubgroupShuffleUpINTELIDv8_fET_S1_S1_j(ptr noundef byval(<8 x float>) align 32 %[[ARG0:[0-9]+]], ptr noundef byval(<8 x float>) align 32 %[[ARG1:[0-9]+]] +// UP_V8_FLOAT: <8 x float> @_Z30__spirv_SubgroupShuffleUpINTELDv8_fS_j(ptr noundef byval(<8 x float>) align 32 %[[ARG0:[0-9]+]], ptr noundef byval(<8 x float>) align 32 %[[ARG1:[0-9]+]] // UP_V8_FLOAT: %[[UPV8F32_BYVAL_LOAD_OP0:[0-9]+]] = load <8 x float>, ptr %[[ARG0]], align 32 // UP_V8_FLOAT: %[[UPV8F32_BYVAL_LOAD_OP1:[0-9]+]] = load <8 x float>, ptr %[[ARG1]], align 32 // UP_V8_FLOAT: %[[UPV8F32_CALL_SHUFFLE:[0-9]+]] = call <8 x float> @__mux_sub_group_shuffle_up_v8f32(<8 x float> %[[UPV8F32_BYVAL_LOAD_OP0]], <8 x float> %[[UPV8F32_BYVAL_LOAD_OP1]], i32 // UP_V8_FLOAT: ret <8 x float> %[[UPV8F32_CALL_SHUFFLE:[0-9]+]] -// SELECT_V8_SELECT_I32: <8 x i32> @_Z28__spirv_SubgroupShuffleINTELIDv8_jET_S1_j(ptr noundef byval(<8 x i32>) align 32 %[[ARG0:[0-9]+]], +// SELECT_V8_SELECT_I32: <8 x i32> @_Z28__spirv_SubgroupShuffleINTELDv8_jj(ptr noundef byval(<8 x i32>) align 32 %[[ARG0:[0-9]+]], // SELECT_V8_SELECT_I32: %[[SELV8I32_BYVAL_LOAD_OP0:[0-9]+]] = load <8 x i32>, ptr %[[ARG0]], align 32 // SELECT_V8_SELECT_I32: %[[SELV8I32_CALL_SHUFFLE:[0-9]+]] = call <8 x i32> @__mux_sub_group_shuffle_v8i32(<8 x i32> %[[SELV8I32_BYVAL_LOAD_OP0]], i32 // SELECT_V8_SELECT_I32: ret <8 x i32> %[[SELV8I32_CALL_SHUFFLE:[0-9]+]] From eed108a6ba3a730a35eb6e2e077534c2f3d715ec Mon Sep 17 00:00:00 2001 From: Wenju He Date: Mon, 31 Mar 2025 18:59:32 -0700 Subject: [PATCH 2/6] refine format --- libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl index 9862019a41b72..47bf7555166d5 100644 --- a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl +++ b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl @@ -378,8 +378,7 @@ long __clc__3d_to_linear_local_id(ulong3 id) { _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __spirv_GroupBroadcast( \ int scope, TYPE x, ulong local_id) { \ if (scope == Subgroup) { \ - return __spirv_SubgroupShuffleINTEL( \ - x, local_id); \ + return __spirv_SubgroupShuffleINTEL(x, local_id); \ } \ bool source = (__spirv_LocalInvocationIndex() == local_id); \ __local TYPE *scratch = __CLC_APPEND(__clc__get_group_scratch_, TYPE)(); \ From c750ebf38d115caaf7702cf5b52a8d016b7d0ac7 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Mon, 31 Mar 2025 18:59:45 -0700 Subject: [PATCH 3/6] remove SubgroupShuffle*INTEL declarations from libspirv --- .../lib/amdgcn-amdhsa/group/collectives.cl | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl index 47bf7555166d5..5ff09386efab3 100644 --- a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl +++ b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl @@ -38,26 +38,6 @@ __clc__get_group_scratch_float() __asm("__clc__get_group_scratch_float"); __local double * __clc__get_group_scratch_double() __asm("__clc__get_group_scratch_double"); -#define __CLC_DECLARE_SHUFFLES(TYPE) \ - _CLC_OVERLOAD _CLC_DECL TYPE __spirv_SubgroupShuffleINTEL( \ - TYPE, unsigned int); \ - _CLC_OVERLOAD _CLC_DECL TYPE __spirv_SubgroupShuffleUpINTEL( \ - TYPE, TYPE, unsigned int); - -__CLC_DECLARE_SHUFFLES(char); -__CLC_DECLARE_SHUFFLES(unsigned char); -__CLC_DECLARE_SHUFFLES(short); 
-__CLC_DECLARE_SHUFFLES(unsigned short); -__CLC_DECLARE_SHUFFLES(int); -__CLC_DECLARE_SHUFFLES(unsigned int); -__CLC_DECLARE_SHUFFLES(half); -__CLC_DECLARE_SHUFFLES(float); -__CLC_DECLARE_SHUFFLES(long); -__CLC_DECLARE_SHUFFLES(unsigned long); -__CLC_DECLARE_SHUFFLES(double); - -#undef __CLC_DECLARE_SHUFFLES - #define __CLC_APPEND(NAME, SUFFIX) NAME##SUFFIX #define __CLC_ADD(x, y) (x + y) From 9aa422ac3b997f1cf4e3ea2cc53a7e1588ff7b00 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Wed, 2 Apr 2025 03:57:00 -0700 Subject: [PATCH 4/6] update test: spir64 -> spirv64 --- clang/test/CodeGenSPIRV/spirv-builtin-lookup-intel-subgroup.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/CodeGenSPIRV/spirv-builtin-lookup-intel-subgroup.cpp b/clang/test/CodeGenSPIRV/spirv-builtin-lookup-intel-subgroup.cpp index 13c88e25bc988..19ef6c2530a21 100644 --- a/clang/test/CodeGenSPIRV/spirv-builtin-lookup-intel-subgroup.cpp +++ b/clang/test/CodeGenSPIRV/spirv-builtin-lookup-intel-subgroup.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple=spir64 -fdeclare-spirv-builtins -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple=spirv64 -fdeclare-spirv-builtins -emit-llvm %s -o - | FileCheck %s template void test_shuffle() { T v; From 2f5b830735c85d30aa6cb61f2776ff0a3451bf7e Mon Sep 17 00:00:00 2001 From: Wenju He Date: Wed, 2 Apr 2025 05:11:41 -0700 Subject: [PATCH 5/6] add back old mangling implementation for backward compatibility --- .../amdgcn-amdhsa/misc/sub_group_shuffle.cl | 468 +++++++++++++++++- 1 file changed, 464 insertions(+), 4 deletions(-) diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl b/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl index 3b4be56efd2f2..bddfd67067f96 100644 --- a/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl +++ b/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl @@ -17,6 +17,11 @@ __spirv_SubgroupShuffleINTEL(int Data, unsigned int InvocationId) { int Index = InvocationId; return __builtin_amdgcn_ds_bpermute(Index << 2, Data); } +// int __spirv_SubgroupShuffleINTEL(int, unsigned int) +_CLC_DEF int +_Z28__spirv_SubgroupShuffleINTELIiET_S0_j(int Data, unsigned int InvocationId) { + return __spirv_SubgroupShuffleINTEL(Data, InvocationId); +} // Sub 32-bit types. 
#define __AMDGCN_CLC_SUBGROUP_SUB_I32(TYPE) \ @@ -28,6 +33,22 @@ __AMDGCN_CLC_SUBGROUP_SUB_I32(char); __AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned char); __AMDGCN_CLC_SUBGROUP_SUB_I32(short); __AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned short); +#undef __AMDGCN_CLC_SUBGROUP_SUB_I32 + +// _Z28__spirv_SubgroupShuffleINTELIaET_S0_j - char +// _Z28__spirv_SubgroupShuffleINTELIhET_S0_j - unsigned char +// _Z28__spirv_SubgroupShuffleINTELIsET_S0_j - long +// _Z28__spirv_SubgroupShuffleINTELItET_S0_j - unsigned long +// _Z28__spirv_SubgroupShuffleINTELIDF16_ET_S0_j - half +#define __AMDGCN_CLC_SUBGROUP_SUB_I32(TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE _Z28__spirv_SubgroupShuffleINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleINTEL(Data, InvocationId); \ + } +__AMDGCN_CLC_SUBGROUP_SUB_I32(char, a); +__AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned char, h); +__AMDGCN_CLC_SUBGROUP_SUB_I32(short, s); +__AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned short, t); #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable @@ -37,6 +58,10 @@ _CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleINTEL( tmp = __spirv_SubgroupShuffleINTEL(tmp, InvocationId); return as_half(tmp); } +_CLC_DEF half _Z28__spirv_SubgroupShuffleINTELIDF16_ET_S0_j( + half Data, unsigned int InvocationId) { + return __spirv_SubgroupShuffleINTEL(Data, InvocationId); +} #endif // cl_khr_fp16 #undef __AMDGCN_CLC_SUBGROUP_SUB_I32 @@ -53,6 +78,17 @@ __AMDGCN_CLC_SUBGROUP_I32(unsigned int, uint); __AMDGCN_CLC_SUBGROUP_I32(float, float); #undef __AMDGCN_CLC_SUBGROUP_I32 +// __spirv_SubgroupShuffleINTEL - unsigned int +// __spirv_SubgroupShuffleINTEL- float +#define __AMDGCN_CLC_SUBGROUP_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE _Z28__spirv_SubgroupShuffleINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleINTEL(Data, InvocationId); \ + } +__AMDGCN_CLC_SUBGROUP_I32(unsigned int, uint, j); +__AMDGCN_CLC_SUBGROUP_I32(float, float, f); +#undef __AMDGCN_CLC_SUBGROUP_I32 + // 64-bit types. #define __AMDGCN_CLC_SUBGROUP_I64(TYPE, CAST_TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ @@ -67,6 +103,19 @@ __AMDGCN_CLC_SUBGROUP_I64(unsigned long, ulong); __AMDGCN_CLC_SUBGROUP_I64(double, double); #undef __AMDGCN_CLC_SUBGROUP_I64 +// __spirv_SubgroupShuffleINTEL - long +// __spirv_SubgroupShuffleINTEL - unsigned long +// __spirv_SubgroupShuffleINTEL - double +#define __AMDGCN_CLC_SUBGROUP_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE _Z28__spirv_SubgroupShuffleINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleINTEL(Data, InvocationId); \ + } +__AMDGCN_CLC_SUBGROUP_I64(long, long, l); +__AMDGCN_CLC_SUBGROUP_I64(unsigned long, ulong, m); +__AMDGCN_CLC_SUBGROUP_I64(double, double, d); +#undef __AMDGCN_CLC_SUBGROUP_I64 + // Vector types. 
#define __AMDGCN_CLC_SUBGROUP_TO_VEC(TYPE, NUM_ELEMS) \ _CLC_OVERLOAD _CLC_DEF TYPE \ @@ -133,6 +182,68 @@ __AMDGCN_CLC_SUBGROUP_TO_VEC(double8, 8) __AMDGCN_CLC_SUBGROUP_TO_VEC(double16, 16) #undef __AMDGCN_CLC_SUBGROUP_TO_VEC +#define __AMDGCN_CLC_SUBGROUP_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ + _CLC_DEF TYPE \ + _Z28__spirv_SubgroupShuffleINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_j( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleINTEL(Data, InvocationId); \ + } + +// [u]char +__AMDGCN_CLC_SUBGROUP_TO_VEC(char2, a, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(char4, a, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(char8, a, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(char16, a, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar2, h, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar4, h, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar8, h, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uchar16, h, 16) +// [u]short +__AMDGCN_CLC_SUBGROUP_TO_VEC(short2, s, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(short4, s, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(short8, s, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(short16, s, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort2, t, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort4, t, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort8, t, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ushort16, t, 16) +// [u]int +__AMDGCN_CLC_SUBGROUP_TO_VEC(int2, i, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(int4, i, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(int8, i, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(int16, i, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uint2, j, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uint4, j, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uint8, j, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(uint16, j, 16) +// [u]long +__AMDGCN_CLC_SUBGROUP_TO_VEC(long2, l, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(long4, l, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(long8, l, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(long16, l, 16) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong2, m, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong4, m, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong8, m, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(ulong16, m, 16) +// half +#ifdef cl_khr_fp16 +__AMDGCN_CLC_SUBGROUP_TO_VEC(half2, DF16_, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(half4, DF16_, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(half8, DF16_, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(half16, DF16_, 16) +#endif // cl_khr_fp16 +// float +__AMDGCN_CLC_SUBGROUP_TO_VEC(float2, f, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(float4, f, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(float8, f, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(float16, f, 16) +// double +__AMDGCN_CLC_SUBGROUP_TO_VEC(double2, d, 2) +__AMDGCN_CLC_SUBGROUP_TO_VEC(double4, d, 4) +__AMDGCN_CLC_SUBGROUP_TO_VEC(double8, d, 8) +__AMDGCN_CLC_SUBGROUP_TO_VEC(double16, d, 16) +#undef __AMDGCN_CLC_SUBGROUP_TO_VEC + // Shuffle XOR _CLC_OVERLOAD _CLC_DEF int __spirv_SubgroupShuffleXorINTEL(int Data, unsigned int InvocationId) { @@ -142,6 +253,12 @@ __spirv_SubgroupShuffleXorINTEL(int Data, unsigned int InvocationId) { index >= ((self + SUBGROUP_SIZE) & ~(SUBGROUP_SIZE - 1)) ? self : index; return __builtin_amdgcn_ds_bpermute(index << 2, Data); } +// int __spirv_SubgroupShuffleXorINTEL(int, unsigned int) +_CLC_DEF int +_Z31__spirv_SubgroupShuffleXorINTELIiET_S0_j(int Data, + unsigned int InvocationId) { + return __spirv_SubgroupShuffleXorINTEL(Data, InvocationId); +} // Sub 32-bit types. 
#define __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(TYPE) \ @@ -163,9 +280,30 @@ _CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleXorINTEL( #endif // cl_khr_fp16 #undef __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32 +// _Z31__spirv_SubgroupShuffleXorINTELIaET_S0_j - char +// _Z31__spirv_SubgroupShuffleXorINTELIhET_S0_j - unsigned char +// _Z31__spirv_SubgroupShuffleXorINTELIsET_S0_j - short +// _Z31__spirv_SubgroupShuffleXorINTELItET_S0_j - unsigned short +// _Z31__spirv_SubgroupShuffleXorINTELIDF16_ET_S0_j - half +#define __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE \ + _Z31__spirv_SubgroupShuffleXorINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleXorINTEL(Data, InvocationId); \ + } +__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(char, a); +__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(unsigned char, h); +__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(short, s); +__AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(unsigned short, t); +#ifdef cl_khr_fp16 +_CLC_DEF half _Z31__spirv_SubgroupShuffleXorINTELIDF16_ET_S0_j( + half Data, unsigned int InvocationId) { + return __spirv_SubgroupShuffleXorINTEL(Data, InvocationId); +} +#endif // cl_khr_fp16 +#undef __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32 + // 32-bit types. -// __spirv_SubgroupShuffleXorINTEL - unsigned int -// __spirv_SubgroupShuffleXorINTEL - float #define __AMDGCN_CLC_SUBGROUP_XOR_I32(TYPE, CAST_TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE \ __spirv_SubgroupShuffleXorINTEL(TYPE Data, unsigned int InvocationId) { \ @@ -177,6 +315,18 @@ __AMDGCN_CLC_SUBGROUP_XOR_I32(unsigned int, uint); __AMDGCN_CLC_SUBGROUP_XOR_I32(float, float); #undef __AMDGCN_CLC_SUBGROUP_XOR_I32 +// __spirv_SubgroupShuffleXorINTEL - unsigned int +// __spirv_SubgroupShuffleXorINTEL - float +#define __AMDGCN_CLC_SUBGROUP_XOR_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE \ + _Z31__spirv_SubgroupShuffleXorINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleXorINTEL(Data, InvocationId); \ + } +__AMDGCN_CLC_SUBGROUP_XOR_I32(unsigned int, uint, j); +__AMDGCN_CLC_SUBGROUP_XOR_I32(float, float, f); +#undef __AMDGCN_CLC_SUBGROUP_XOR_I32 + // 64-bit types. #define __AMDGCN_CLC_SUBGROUP_XOR_I64(TYPE, CAST_TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE \ @@ -191,7 +341,22 @@ __AMDGCN_CLC_SUBGROUP_XOR_I64(unsigned long, ulong); __AMDGCN_CLC_SUBGROUP_XOR_I64(double, double); #undef __AMDGCN_CLC_SUBGROUP_XOR_I64 +// __spirv_SubgroupShuffleXorINTEL - long +// __spirv_SubgroupShuffleXorINTEL - unsigned long +// __spirv_SubgroupShuffleXorINTEL - double +#define __AMDGCN_CLC_SUBGROUP_XOR_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE \ + _Z31__spirv_SubgroupShuffleXorINTELI##MANGLED_TYPE_NAME##ET_S0_j( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleXorINTEL(Data, InvocationId); \ + } +__AMDGCN_CLC_SUBGROUP_XOR_I64(long, long, l); +__AMDGCN_CLC_SUBGROUP_XOR_I64(unsigned long, ulong, m); +__AMDGCN_CLC_SUBGROUP_XOR_I64(double, double, d); +#undef __AMDGCN_CLC_SUBGROUP_XOR_I64 + // Vector types. 
+ #define __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(TYPE, NUM_ELEMS) \ _CLC_OVERLOAD _CLC_DEF TYPE \ __spirv_SubgroupShuffleXorINTEL(TYPE Data, unsigned int InvocationId) { \ @@ -256,6 +421,67 @@ __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double8, 8) __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double16, 16) #undef __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC +#define __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ + _CLC_DEF TYPE \ + _Z31__spirv_SubgroupShuffleXorINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_j( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleXorINTEL(Data, InvocationId); \ + } +// [u]char +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char2, a, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char4, a, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char8, a, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char16, a, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar2, h, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar4, h, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar8, h, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uchar16, h, 16) +// [u]short +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short2, s, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short4, s, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short8, s, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(short16, s, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort2, t, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort4, t, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort8, t, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ushort16, t, 16) +// [u]int +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int2, i, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int4, i, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int8, i, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(int16, i, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint2, j, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint4, j, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint8, j, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(uint16, j, 16) +// [u]long +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long2, l, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long4, l, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long8, l, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(long16, l, 16) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong2, m, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong4, m, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong8, m, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(ulong16, m, 16) +// float +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float2, f, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float4, f, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float8, f, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(float16, f, 16) +// half +#ifdef cl_khr_fp16 +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half2, DF16_, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half4, DF16_, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half8, DF16_, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(half16, DF16_, 16) +#endif // cl_khr_fp16 +// double +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double2, d, 2) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double4, d, 4) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double8, d, 8) +__AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(double16, d, 16) +#undef __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC + // Shuffle Up _CLC_OVERLOAD _CLC_DEF int __spirv_SubgroupShuffleUpINTEL(int previous, int current, unsigned int delta) { @@ -278,6 +504,12 @@ __spirv_SubgroupShuffleUpINTEL(int previous, int current, unsigned int delta) { return __builtin_amdgcn_ds_bpermute(index << 2, val); } +// int __spirv_SubgroupShuffleUpINTEL(int, int, unsigned int) +_CLC_DEF int +_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j(int previous, int current, + unsigned int delta) { + return __spirv_SubgroupShuffleUpINTEL(previous, current, delta); +} // Sub 32-bit types. 
#define __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(TYPE) \ @@ -301,6 +533,34 @@ _CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleUpINTEL( #endif // cl_khr_fp16 #undef __AMDGCN_CLC_SUBGROUP_UP_SUB_I32 +// _Z30__spirv_SubgroupShuffleUpINTELIaET_S0_S0_j - char +// _Z30__spirv_SubgroupShuffleUpINTELIhET_S0_S0_j - unsigned char +// _Z30__spirv_SubgroupShuffleUpINTELIsET_S0_S0_j - short +// _Z30__spirv_SubgroupShuffleUpINTELItET_S0_S0_j - unsigned short +// _Z30__spirv_SubgroupShuffleUpINTELIDF16_ET_S0_S0_j - half +#define __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE \ + _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ + TYPE previous, TYPE current, unsigned int delta) { \ + return __spirv_SubgroupShuffleUpINTEL(previous, current, delta); \ + } +__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(char, a); +__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned char, h); +__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(short, s); +__AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned short, t); +// half +#ifdef cl_khr_fp16 +_CLC_DEF half _Z30__spirv_SubgroupShuffleUpINTELIDF16_ET_S0_S0_j( + half previous, half current, unsigned int delta) { + unsigned short tmpP = as_ushort(previous); + unsigned short tmpC = as_ushort(current); + tmpC = (unsigned short)_Z30__spirv_SubgroupShuffleUpINTELIiET_S0_S0_j( + (int)tmpP, (int)tmpC, delta); + return __spirv_SubgroupShuffleUpINTEL(previous, current, delta); +} +#endif // cl_khr_fp16 +#undef __AMDGCN_CLC_SUBGROUP_UP_SUB_I32 + // 32-bit types. #define __AMDGCN_CLC_SUBGROUP_UP_I32(TYPE, CAST_TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ @@ -313,6 +573,18 @@ __AMDGCN_CLC_SUBGROUP_UP_I32(unsigned int, uint); __AMDGCN_CLC_SUBGROUP_UP_I32(float, float); #undef __AMDGCN_CLC_SUBGROUP_UP_I32 +// __spirv_SubgroupShuffleUpINTELi - unsigned int +// __spirv_SubgroupShuffleUpINTELi - float +#define __AMDGCN_CLC_SUBGROUP_UP_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE \ + _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ + TYPE previous, TYPE current, unsigned int delta) { \ + return __spirv_SubgroupShuffleUpINTEL(previous, current, delta); \ + } +__AMDGCN_CLC_SUBGROUP_UP_I32(unsigned int, uint, j); +__AMDGCN_CLC_SUBGROUP_UP_I32(float, float, f); +#undef __AMDGCN_CLC_SUBGROUP_UP_I32 + // 64-bit types. #define __AMDGCN_CLC_SUBGROUP_UP_I64(TYPE, CAST_TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ @@ -331,6 +603,20 @@ __AMDGCN_CLC_SUBGROUP_UP_I64(unsigned long, ulong); __AMDGCN_CLC_SUBGROUP_UP_I64(double, double); #undef __AMDGCN_CLC_SUBGROUP_UP_I64 +// __spirv_SubgroupShuffleUpINTEL - long +// __spirv_SubgroupShuffleUpINTEL - unsigned long +// __spirv_SubgroupShuffleUpINTEL - double +#define __AMDGCN_CLC_SUBGROUP_UP_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE \ + _Z30__spirv_SubgroupShuffleUpINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ + TYPE previous, TYPE current, unsigned int delta) { \ + return __spirv_SubgroupShuffleUpINTEL(previous, current, delta); \ + } +__AMDGCN_CLC_SUBGROUP_UP_I64(long, long, l); +__AMDGCN_CLC_SUBGROUP_UP_I64(unsigned long, ulong, m); +__AMDGCN_CLC_SUBGROUP_UP_I64(double, double, d); +#undef __AMDGCN_CLC_SUBGROUP_UP_I64 + // Vector types. 
#define __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(TYPE, NUM_ELEMS) \ _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ @@ -396,6 +682,67 @@ __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double8, 8) __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double16, 16) #undef __AMDGCN_CLC_SUBGROUP_UP_TO_VEC +#define __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ + _CLC_DEF TYPE \ + _Z30__spirv_SubgroupShuffleUpINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \ + TYPE previous, TYPE current, unsigned int delta) { \ + return __spirv_SubgroupShuffleUpINTEL(previous, current, delta); \ + } +// [u]char +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char2, a, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char4, a, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char8, a, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(char16, a, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar2, h, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar4, h, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar8, h, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uchar16, h, 16) +// [u]short +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short2, s, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short4, s, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short8, s, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(short16, s, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort2, t, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort4, t, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort8, t, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ushort16, t, 16) +// [u]int +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int2, i, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int4, i, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int8, i, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(int16, i, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint2, j, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint4, j, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint8, j, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(uint16, j, 16) +// [u]long +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long2, l, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long4, l, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long8, l, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(long16, l, 16) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong2, m, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong4, m, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong8, m, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(ulong16, m, 16) +// half +#ifdef cl_khr_fp16 +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half2, DF16_, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half4, DF16_, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half8, DF16_, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(half16, DF16_, 16) +#endif // cl_khr_fp16 +// float +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float2, f, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float4, f, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float8, f, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(float16, f, 16) +// double +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double2, d, 2) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double4, d, 4) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double8, d, 8) +__AMDGCN_CLC_SUBGROUP_UP_TO_VEC(double16, d, 16) +#undef __AMDGCN_CLC_SUBGROUP_UP_TO_VEC + // Shuffle Down _CLC_OVERLOAD _CLC_DEF int __spirv_SubgroupShuffleDownINTEL(int current, int next, unsigned int delta) { @@ -418,6 +765,12 @@ __spirv_SubgroupShuffleDownINTEL(int current, int next, unsigned int delta) { return __builtin_amdgcn_ds_bpermute(index << 2, val); } +// int __spirv_SubgroupShuffleDownINTEL(int, int, unsigned int) +_CLC_DEF int +_Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int current, int next, + unsigned int delta) { + return __spirv_SubgroupShuffleDownINTEL(current, next, delta); +} // Sub 32-bit types. 
#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(TYPE) \ @@ -441,9 +794,31 @@ _CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleDownINTEL( #endif // cl_khr_fp16 #undef __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32 +// _Z32__spirv_SubgroupShuffleDownINTELIaET_S0_S0_j - char +// _Z32__spirv_SubgroupShuffleDownINTELIhET_S0_S0_j - unsigned char +// _Z32__spirv_SubgroupShuffleDownINTELIsET_S0_S0_j - short +// _Z32__spirv_SubgroupShuffleDownINTELItET_S0_S0_j - unsigned short +// _Z32__spirv_SubgroupShuffleDownINTELIDF16_ET_S0_S0_j - half +#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE \ + _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ + TYPE current, TYPE next, unsigned int delta) { \ + return __spirv_SubgroupShuffleDownINTEL(current, next, delta); \ + } +__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(char, a); +__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned char, h); +__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(short, s); +__AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned short, t); +// half +#ifdef cl_khr_fp16 +_CLC_DEF half _Z32__spirv_SubgroupShuffleDownINTELIDF16_ET_S0_S0_j( + half current, half next, unsigned int delta) { + return __spirv_SubgroupShuffleDownINTEL(current, next, delta); +} +#endif // cl_khr_fp16 +#undef __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32 + // 32-bit types. -// __spirv_SubgroupShuffleDownINTEL - unsigned int -// __spirv_SubgroupShuffleDownINTEL - float #define __AMDGCN_CLC_SUBGROUP_DOWN_I32(TYPE, CAST_TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ TYPE current, TYPE next, unsigned int delta) { \ @@ -455,6 +830,18 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I32(unsigned int, uint); __AMDGCN_CLC_SUBGROUP_DOWN_I32(float, float); #undef __AMDGCN_CLC_SUBGROUP_DOWN_I32 +// __spirv_SubgroupShuffleDownINTEL - unsigned int +// __spirv_SubgroupShuffleDownINTEL - float +#define __AMDGCN_CLC_SUBGROUP_DOWN_I32(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE \ + _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ + TYPE current, TYPE next, unsigned int delta) { \ + return __spirv_SubgroupShuffleDownINTEL(current, next, delta); \ + } +__AMDGCN_CLC_SUBGROUP_DOWN_I32(unsigned int, uint, j); +__AMDGCN_CLC_SUBGROUP_DOWN_I32(float, float, f); +#undef __AMDGCN_CLC_SUBGROUP_DOWN_I32 + // 64-bit types. #define __AMDGCN_CLC_SUBGROUP_DOWN_I64(TYPE, CAST_TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ @@ -473,6 +860,18 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I64(unsigned long, ulong); __AMDGCN_CLC_SUBGROUP_DOWN_I64(double, double); #undef __AMDGCN_CLC_SUBGROUP_DOWN_I64 +// double __spirv_SubgroupShuffleDownINTEL(double, unsigned int, int) +#define __AMDGCN_CLC_SUBGROUP_DOWN_I64(TYPE, CAST_TYPE, MANGLED_TYPE_NAME) \ + _CLC_DEF TYPE \ + _Z32__spirv_SubgroupShuffleDownINTELI##MANGLED_TYPE_NAME##ET_S0_S0_j( \ + TYPE current, TYPE next, unsigned int delta) { \ + return __spirv_SubgroupShuffleDownINTEL(current, next, delta); \ + } +__AMDGCN_CLC_SUBGROUP_DOWN_I64(long, long, l); +__AMDGCN_CLC_SUBGROUP_DOWN_I64(unsigned long, ulong, m); +__AMDGCN_CLC_SUBGROUP_DOWN_I64(double, double, d); +#undef __AMDGCN_CLC_SUBGROUP_DOWN_I64 + // Vector types. 
#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(TYPE, NUM_ELEMS) \ _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ @@ -537,3 +936,64 @@ __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double4, 4) __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double8, 8) __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double16, 16) #undef __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC + +#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(TYPE, MANGLED_SCALAR_TY, NUM_ELEMS) \ + _CLC_DEF TYPE \ + _Z32__spirv_SubgroupShuffleDownINTELIDv##NUM_ELEMS##_##MANGLED_SCALAR_TY##ET_S1_S1_j( \ + TYPE current, TYPE next, unsigned int delta) { \ + return __spirv_SubgroupShuffleDownINTEL(current, next, delta); \ + } +// [u]char +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char2, a, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char4, a, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char8, a, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char16, a, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar2, h, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar4, h, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar8, h, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uchar16, h, 16) +// [u]short +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short2, s, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short4, s, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short8, s, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(short16, s, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort2, t, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort4, t, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort8, t, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ushort16, t, 16) +// [u]int +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int2, i, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int4, i, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int8, i, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(int16, i, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint2, j, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint4, j, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint8, j, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(uint16, j, 16) +// [u]long +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long2, l, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long4, l, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long8, l, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(long16, l, 16) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong2, m, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong4, m, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong8, m, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(ulong16, m, 16) +// half +#ifdef cl_khr_fp16 +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half2, DF16_, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half4, DF16_, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half8, DF16_, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(half16, DF16_, 16) +#endif // cl_khr_fp16 +// float +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float2, f, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float4, f, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float8, f, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(float16, f, 16) +// double +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double2, d, 2) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double4, d, 4) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double8, d, 8) +__AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(double16, d, 16) +#undef __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC From 3518a82c0ce97731994cb0d0718d822625880e51 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Wed, 9 Apr 2025 18:01:49 -0700 Subject: [PATCH 6/6] clang-format --- .../lib/amdgcn-amdhsa/group/collectives.cl | 69 +++--- .../amdgcn-amdhsa/misc/sub_group_shuffle.cl | 207 +++++++++--------- 2 files changed, 139 insertions(+), 137 deletions(-) diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl index 5ff09386efab3..910541561950e 100644 --- a/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl +++ 
b/libclc/libspirv/lib/amdgcn-amdhsa/group/collectives.cl @@ -50,39 +50,38 @@ __clc__get_group_scratch_double() __asm("__clc__get_group_scratch_double"); #define __CLC_LOGICAL_OR(x, y) (x || y) #define __CLC_LOGICAL_AND(x, y) (x && y) -#define __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, IDENTITY) \ - uint sg_lid = __spirv_SubgroupLocalInvocationId(); \ - /* Can't use XOR/butterfly shuffles; some lanes may be inactive */ \ - for (int o = 1; o < __spirv_SubgroupMaxSize(); o *= 2) { \ - TYPE contribution = __spirv_SubgroupShuffleUpINTEL(x, x, o); \ - bool inactive = (sg_lid < o); \ - contribution = (inactive) ? IDENTITY : contribution; \ - x = OP(x, contribution); \ - } \ - /* For Reduce, broadcast result from highest active lane */ \ - TYPE result; \ - if (op == Reduce) { \ - result = __spirv_SubgroupShuffleINTEL( \ - x, __spirv_SubgroupSize() - 1); \ - *carry = result; \ - } /* For InclusiveScan, use results as computed */ \ - else if (op == InclusiveScan) { \ - result = x; \ - *carry = result; \ - } /* For ExclusiveScan, shift and prepend identity */ \ - else if (op == ExclusiveScan) { \ - *carry = x; \ - result = __spirv_SubgroupShuffleUpINTEL(x, x, 1); \ - if (sg_lid == 0) { \ - result = IDENTITY; \ - } \ - } \ +#define __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, IDENTITY) \ + uint sg_lid = __spirv_SubgroupLocalInvocationId(); \ + /* Can't use XOR/butterfly shuffles; some lanes may be inactive */ \ + for (int o = 1; o < __spirv_SubgroupMaxSize(); o *= 2) { \ + TYPE contribution = __spirv_SubgroupShuffleUpINTEL(x, x, o); \ + bool inactive = (sg_lid < o); \ + contribution = (inactive) ? IDENTITY : contribution; \ + x = OP(x, contribution); \ + } \ + /* For Reduce, broadcast result from highest active lane */ \ + TYPE result; \ + if (op == Reduce) { \ + result = __spirv_SubgroupShuffleINTEL(x, __spirv_SubgroupSize() - 1); \ + *carry = result; \ + } /* For InclusiveScan, use results as computed */ \ + else if (op == InclusiveScan) { \ + result = x; \ + *carry = result; \ + } /* For ExclusiveScan, shift and prepend identity */ \ + else if (op == ExclusiveScan) { \ + *carry = x; \ + result = __spirv_SubgroupShuffleUpINTEL(x, x, 1); \ + if (sg_lid == 0) { \ + result = IDENTITY; \ + } \ + } \ return result; -#define __CLC_SUBGROUP_COLLECTIVE(NAME, OP, TYPE, IDENTITY) \ - _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __CLC_APPEND( \ - __clc__Subgroup, NAME)(uint op, TYPE x, TYPE * carry) { \ - __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, IDENTITY) \ +#define __CLC_SUBGROUP_COLLECTIVE(NAME, OP, TYPE, IDENTITY) \ + _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __CLC_APPEND( \ + __clc__Subgroup, NAME)(uint op, TYPE x, TYPE * carry) { \ + __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, IDENTITY) \ } __CLC_SUBGROUP_COLLECTIVE(IAdd, __CLC_ADD, char, 0) @@ -356,7 +355,7 @@ long __clc__3d_to_linear_local_id(ulong3 id) { #define __CLC_GROUP_BROADCAST(TYPE, TYPE_MANGLED) \ _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __spirv_GroupBroadcast( \ - int scope, TYPE x, ulong local_id) { \ + int scope, TYPE x, ulong local_id) { \ if (scope == Subgroup) { \ return __spirv_SubgroupShuffleINTEL(x, local_id); \ } \ @@ -371,17 +370,17 @@ long __clc__3d_to_linear_local_id(ulong3 id) { return result; \ } \ _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __spirv_GroupBroadcast( \ - int scope, TYPE x, ulong2 local_id) { \ + int scope, TYPE x, ulong2 local_id) { \ ulong linear_local_id = __clc__2d_to_linear_local_id(local_id); \ return __spirv_GroupBroadcast(scope, x, linear_local_id); \ } \ _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE 
__spirv_GroupBroadcast( \ - int scope, TYPE x, ulong3 local_id) { \ + int scope, TYPE x, ulong3 local_id) { \ ulong linear_local_id = __clc__3d_to_linear_local_id(local_id); \ return __spirv_GroupBroadcast(scope, x, linear_local_id); \ } \ _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __spirv_GroupBroadcast( \ - int scope, TYPE x, uint local_id) { \ + int scope, TYPE x, uint local_id) { \ return __spirv_GroupBroadcast(scope, x, (ulong)local_id); \ } __CLC_GROUP_BROADCAST(char, a); diff --git a/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl b/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl index 8745f43836b8a..13ac2ee2ddc24 100644 --- a/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl +++ b/libclc/libspirv/lib/amdgcn-amdhsa/misc/sub_group_shuffle.cl @@ -24,10 +24,10 @@ _Z28__spirv_SubgroupShuffleINTELIiET_S0_j(int Data, unsigned int InvocationId) { } // Sub 32-bit types. -#define __AMDGCN_CLC_SUBGROUP_SUB_I32(TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ - TYPE Data, unsigned int InvocationId) { \ - return __spirv_SubgroupShuffleINTEL((int)Data, InvocationId); \ +#define __AMDGCN_CLC_SUBGROUP_SUB_I32(TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleINTEL((int)Data, InvocationId); \ } __AMDGCN_CLC_SUBGROUP_SUB_I32(char); __AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned char); @@ -52,8 +52,8 @@ __AMDGCN_CLC_SUBGROUP_SUB_I32(unsigned short, t); #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleINTEL( - half Data, unsigned int InvocationId) { +_CLC_OVERLOAD _CLC_DEF half +__spirv_SubgroupShuffleINTEL(half Data, unsigned int InvocationId) { unsigned short tmp = __clc_as_ushort(Data); tmp = __spirv_SubgroupShuffleINTEL(tmp, InvocationId); return __clc_as_half(tmp); @@ -67,12 +67,12 @@ _CLC_DEF half _Z28__spirv_SubgroupShuffleINTELIDF16_ET_S0_j( #undef __AMDGCN_CLC_SUBGROUP_SUB_I32 // 32-bit types. -#define __AMDGCN_CLC_SUBGROUP_I32(TYPE, CAST_TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ - TYPE Data, unsigned int InvocationId) { \ - return __builtin_astype( \ - __spirv_SubgroupShuffleINTEL(__clc_as_int(Data), InvocationId), \ - CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_I32(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + return __builtin_astype( \ + __spirv_SubgroupShuffleINTEL(__clc_as_int(Data), InvocationId), \ + CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_I32(unsigned int, uint); __AMDGCN_CLC_SUBGROUP_I32(float, float); @@ -90,13 +90,13 @@ __AMDGCN_CLC_SUBGROUP_I32(float, float, f); #undef __AMDGCN_CLC_SUBGROUP_I32 // 64-bit types. 
-#define __AMDGCN_CLC_SUBGROUP_I64(TYPE, CAST_TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ - TYPE Data, unsigned int InvocationId) { \ - int2 tmp = __clc_as_int2(Data); \ - tmp.lo = __spirv_SubgroupShuffleINTEL(tmp.lo, InvocationId); \ - tmp.hi = __spirv_SubgroupShuffleINTEL(tmp.hi, InvocationId); \ - return __builtin_astype(tmp, CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_I64(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + int2 tmp = __clc_as_int2(Data); \ + tmp.lo = __spirv_SubgroupShuffleINTEL(tmp.lo, InvocationId); \ + tmp.hi = __spirv_SubgroupShuffleINTEL(tmp.hi, InvocationId); \ + return __builtin_astype(tmp, CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_I64(long, long); __AMDGCN_CLC_SUBGROUP_I64(unsigned long, ulong); @@ -117,14 +117,14 @@ __AMDGCN_CLC_SUBGROUP_I64(double, double, d); #undef __AMDGCN_CLC_SUBGROUP_I64 // Vector types. -#define __AMDGCN_CLC_SUBGROUP_TO_VEC(TYPE, NUM_ELEMS) \ - _CLC_OVERLOAD _CLC_DEF TYPE \ - __spirv_SubgroupShuffleINTEL(TYPE Data, unsigned int InvocationId) { \ - TYPE res; \ - for (int i = 0; i < NUM_ELEMS; ++i) { \ - res[i] = __spirv_SubgroupShuffleINTEL(Data[i], InvocationId); \ - } \ - return res; \ +#define __AMDGCN_CLC_SUBGROUP_TO_VEC(TYPE, NUM_ELEMS) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + TYPE res; \ + for (int i = 0; i < NUM_ELEMS; ++i) { \ + res[i] = __spirv_SubgroupShuffleINTEL(Data[i], InvocationId); \ + } \ + return res; \ } // [u]char @@ -261,18 +261,18 @@ _Z31__spirv_SubgroupShuffleXorINTELIiET_S0_j(int Data, } // Sub 32-bit types. -#define __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE \ - __spirv_SubgroupShuffleXor(TYPE Data, unsigned int InvocationId) { \ - return __spirv_SubgroupShuffleXorINTEL((int)Data, InvocationId); \ +#define __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleXor( \ + TYPE Data, unsigned int InvocationId) { \ + return __spirv_SubgroupShuffleXorINTEL((int)Data, InvocationId); \ } __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(char); __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(unsigned char); __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(short); __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32(unsigned short); #ifdef cl_khr_fp16 -_CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleXorINTEL( - half Data, unsigned int InvocationId) { +_CLC_OVERLOAD _CLC_DEF half +__spirv_SubgroupShuffleXorINTEL(half Data, unsigned int InvocationId) { unsigned short tmp = __clc_as_ushort(Data); tmp = (unsigned short)__spirv_SubgroupShuffleXorINTEL(tmp, InvocationId); return __clc_as_half(tmp); @@ -304,12 +304,12 @@ _CLC_DEF half _Z31__spirv_SubgroupShuffleXorINTELIDF16_ET_S0_j( #undef __AMDGCN_CLC_SUBGROUP_XOR_SUB_I32 // 32-bit types. 
-#define __AMDGCN_CLC_SUBGROUP_XOR_I32(TYPE, CAST_TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE \ - __spirv_SubgroupShuffleXorINTEL(TYPE Data, unsigned int InvocationId) { \ - return __builtin_astype(__spirv_SubgroupShuffleXorINTEL( \ - __clc_as_int(Data), InvocationId), \ - CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_XOR_I32(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleXorINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + return __builtin_astype( \ + __spirv_SubgroupShuffleXorINTEL(__clc_as_int(Data), InvocationId), \ + CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_XOR_I32(unsigned int, uint); __AMDGCN_CLC_SUBGROUP_XOR_I32(float, float); @@ -328,13 +328,13 @@ __AMDGCN_CLC_SUBGROUP_XOR_I32(float, float, f); #undef __AMDGCN_CLC_SUBGROUP_XOR_I32 // 64-bit types. -#define __AMDGCN_CLC_SUBGROUP_XOR_I64(TYPE, CAST_TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE \ - __spirv_SubgroupShuffleXorINTEL(TYPE Data, unsigned int InvocationId) { \ - int2 tmp = __clc_as_int2(Data); \ - tmp.lo = __spirv_SubgroupShuffleXorINTEL(tmp.lo, InvocationId); \ - tmp.hi = __spirv_SubgroupShuffleXorINTEL(tmp.hi, InvocationId); \ - return __builtin_astype(tmp, CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_XOR_I64(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleXorINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + int2 tmp = __clc_as_int2(Data); \ + tmp.lo = __spirv_SubgroupShuffleXorINTEL(tmp.lo, InvocationId); \ + tmp.hi = __spirv_SubgroupShuffleXorINTEL(tmp.hi, InvocationId); \ + return __builtin_astype(tmp, CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_XOR_I64(long, long); __AMDGCN_CLC_SUBGROUP_XOR_I64(unsigned long, ulong); @@ -357,14 +357,14 @@ __AMDGCN_CLC_SUBGROUP_XOR_I64(double, double, d); // Vector types. -#define __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(TYPE, NUM_ELEMS) \ - _CLC_OVERLOAD _CLC_DEF TYPE \ - __spirv_SubgroupShuffleXorINTEL(TYPE Data, unsigned int InvocationId) { \ - TYPE res; \ - for (int i = 0; i < NUM_ELEMS; ++i) { \ - res[i] = __spirv_SubgroupShuffleXorINTEL(Data[i], InvocationId); \ - } \ - return res; \ +#define __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(TYPE, NUM_ELEMS) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleXorINTEL( \ + TYPE Data, unsigned int InvocationId) { \ + TYPE res; \ + for (int i = 0; i < NUM_ELEMS; ++i) { \ + res[i] = __spirv_SubgroupShuffleXorINTEL(Data[i], InvocationId); \ + } \ + return res; \ } // [u]char __AMDGCN_CLC_SUBGROUP_XOR_TO_VEC(char2, 2) @@ -523,8 +523,9 @@ __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(short); __AMDGCN_CLC_SUBGROUP_UP_SUB_I32(unsigned short); // half #ifdef cl_khr_fp16 -_CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleUpINTEL( - half previous, half current, unsigned int delta) { +_CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleUpINTEL(half previous, + half current, + unsigned int delta) { unsigned short tmpP = __clc_as_ushort(previous); unsigned short tmpC = __clc_as_ushort(current); tmpC = __spirv_SubgroupShuffleUpINTEL(tmpP, tmpC, delta); @@ -558,12 +559,13 @@ _CLC_DEF half _Z30__spirv_SubgroupShuffleUpINTELIDF16_ET_S0_S0_j( #undef __AMDGCN_CLC_SUBGROUP_UP_SUB_I32 // 32-bit types. 
-#define __AMDGCN_CLC_SUBGROUP_UP_I32(TYPE, CAST_TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ - TYPE previous, TYPE current, unsigned int delta) { \ - return __builtin_astype(__spirv_SubgroupShuffleUpINTEL( \ - __clc_as_int(previous), __clc_as_int(current), delta), \ - CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_UP_I32(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ + TYPE previous, TYPE current, unsigned int delta) { \ + return __builtin_astype( \ + __spirv_SubgroupShuffleUpINTEL(__clc_as_int(previous), \ + __clc_as_int(current), delta), \ + CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_UP_I32(unsigned int, uint); __AMDGCN_CLC_SUBGROUP_UP_I32(float, float); @@ -582,17 +584,17 @@ __AMDGCN_CLC_SUBGROUP_UP_I32(float, float, f); #undef __AMDGCN_CLC_SUBGROUP_UP_I32 // 64-bit types. -#define __AMDGCN_CLC_SUBGROUP_UP_I64(TYPE, CAST_TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ - TYPE previous, TYPE current, unsigned int delta) { \ - int2 tmp_previous = __clc_as_int2(previous); \ - int2 tmp_current = __clc_as_int2(current); \ - int2 ret; \ - ret.lo = __spirv_SubgroupShuffleUpINTEL( \ - tmp_previous.lo, tmp_current.lo, delta); \ - ret.hi = __spirv_SubgroupShuffleUpINTEL( \ - tmp_previous.hi, tmp_current.hi, delta); \ - return __builtin_astype(ret, CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_UP_I64(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ + TYPE previous, TYPE current, unsigned int delta) { \ + int2 tmp_previous = __clc_as_int2(previous); \ + int2 tmp_current = __clc_as_int2(current); \ + int2 ret; \ + ret.lo = __spirv_SubgroupShuffleUpINTEL(tmp_previous.lo, tmp_current.lo, \ + delta); \ + ret.hi = __spirv_SubgroupShuffleUpINTEL(tmp_previous.hi, tmp_current.hi, \ + delta); \ + return __builtin_astype(ret, CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_UP_I64(long, long); __AMDGCN_CLC_SUBGROUP_UP_I64(unsigned long, ulong); @@ -616,7 +618,7 @@ __AMDGCN_CLC_SUBGROUP_UP_I64(double, double, d); // Vector types. #define __AMDGCN_CLC_SUBGROUP_UP_TO_VEC(TYPE, NUM_ELEMS) \ _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleUpINTEL( \ - TYPE previous, TYPE current, unsigned int delta) { \ + TYPE previous, TYPE current, unsigned int delta) { \ TYPE res; \ for (int i = 0; i < NUM_ELEMS; ++i) { \ res[i] = __spirv_SubgroupShuffleUpINTEL(previous[i], current[i], delta); \ @@ -769,10 +771,10 @@ _Z32__spirv_SubgroupShuffleDownINTELIiET_S0_S0_j(int current, int next, } // Sub 32-bit types. 
-#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ - TYPE current, TYPE next, unsigned int delta) { \ - return __spirv_SubgroupShuffleDownINTEL((int)current, (int)next, delta); \ +#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ + TYPE current, TYPE next, unsigned int delta) { \ + return __spirv_SubgroupShuffleDownINTEL((int)current, (int)next, delta); \ } __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(char); __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned char); @@ -780,8 +782,8 @@ __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(short); __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned short); // half #ifdef cl_khr_fp16 -_CLC_OVERLOAD _CLC_DEF half __spirv_SubgroupShuffleDownINTEL( - half current, half next, unsigned int delta) { +_CLC_OVERLOAD _CLC_DEF half +__spirv_SubgroupShuffleDownINTEL(half current, half next, unsigned int delta) { unsigned short tmpC = __clc_as_ushort(current); unsigned short tmpN = __clc_as_ushort(next); tmpC = __spirv_SubgroupShuffleDownINTEL(tmpC, tmpN, delta); @@ -809,18 +811,19 @@ __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32(unsigned short, t); #ifdef cl_khr_fp16 _CLC_DEF half _Z32__spirv_SubgroupShuffleDownINTELIDF16_ET_S0_S0_j( half current, half next, unsigned int delta) { - return __spirv_SubgroupShuffleDownINTEL(current, next, delta); + return __spirv_SubgroupShuffleDownINTEL(current, next, delta); } #endif // cl_khr_fp16 #undef __AMDGCN_CLC_SUBGROUP_DOWN_TO_I32 // 32-bit types. -#define __AMDGCN_CLC_SUBGROUP_DOWN_I32(TYPE, CAST_TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ - TYPE current, TYPE next, unsigned int delta) { \ - return __builtin_astype(__spirv_SubgroupShuffleDownINTEL( \ - __clc_as_int(current), __clc_as_int(next), delta), \ - CAST_TYPE); \ +#define __AMDGCN_CLC_SUBGROUP_DOWN_I32(TYPE, CAST_TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ + TYPE current, TYPE next, unsigned int delta) { \ + return __builtin_astype( \ + __spirv_SubgroupShuffleDownINTEL(__clc_as_int(current), \ + __clc_as_int(next), delta), \ + CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_DOWN_I32(unsigned int, uint); __AMDGCN_CLC_SUBGROUP_DOWN_I32(float, float); @@ -841,14 +844,14 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I32(float, float, f); // 64-bit types. #define __AMDGCN_CLC_SUBGROUP_DOWN_I64(TYPE, CAST_TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ - TYPE current, TYPE next, unsigned int delta) { \ - int2 tmp_current = __clc_as_int2(current); \ - int2 tmp_next = __clc_as_int2(next); \ + TYPE current, TYPE next, unsigned int delta) { \ + int2 tmp_current = __clc_as_int2(current); \ + int2 tmp_next = __clc_as_int2(next); \ int2 ret; \ - ret.lo = __spirv_SubgroupShuffleDownINTEL( \ - tmp_current.lo, tmp_next.lo, delta); \ - ret.hi = __spirv_SubgroupShuffleDownINTEL( \ - tmp_current.hi, tmp_next.hi, delta); \ + ret.lo = \ + __spirv_SubgroupShuffleDownINTEL(tmp_current.lo, tmp_next.lo, delta); \ + ret.hi = \ + __spirv_SubgroupShuffleDownINTEL(tmp_current.hi, tmp_next.hi, delta); \ return __builtin_astype(ret, CAST_TYPE); \ } __AMDGCN_CLC_SUBGROUP_DOWN_I64(long, long); @@ -869,14 +872,14 @@ __AMDGCN_CLC_SUBGROUP_DOWN_I64(double, double, d); #undef __AMDGCN_CLC_SUBGROUP_DOWN_I64 // Vector types. 
-#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(TYPE, NUM_ELEMS) \ - _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ - TYPE current, TYPE next, unsigned int delta) { \ - TYPE res; \ - for (int i = 0; i < NUM_ELEMS; ++i) { \ - res[i] = __spirv_SubgroupShuffleDownINTEL(current[i], next[i], delta); \ - } \ - return res; \ +#define __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(TYPE, NUM_ELEMS) \ + _CLC_OVERLOAD _CLC_DEF TYPE __spirv_SubgroupShuffleDownINTEL( \ + TYPE current, TYPE next, unsigned int delta) { \ + TYPE res; \ + for (int i = 0; i < NUM_ELEMS; ++i) { \ + res[i] = __spirv_SubgroupShuffleDownINTEL(current[i], next[i], delta); \ + } \ + return res; \ } // [u]char __AMDGCN_CLC_SUBGROUP_DOWN_TO_VEC(char2, 2)