From de355120e7875e8008d0b26d8b78ff468ce588ff Mon Sep 17 00:00:00 2001
From: Nikolas Klauser
Date: Wed, 10 Apr 2024 12:25:02 +0200
Subject: [PATCH 1/2] [Clang] Add __builtin_selectvector

---
 clang/docs/LanguageExtensions.rst             |   20 +
 clang/include/clang/Basic/Builtins.td         |    6 +
 clang/include/clang/Basic/BuiltinsX86.def     |   24 -
 .../clang/Basic/DiagnosticSemaKinds.td        |    4 +-
 clang/lib/CodeGen/CGBuiltin.cpp               |   31 +-
 clang/lib/Headers/avx512bf16intrin.h          |   24 +-
 clang/lib/Headers/avx512bitalgintrin.h        |   12 +-
 clang/lib/Headers/avx512bwintrin.h            |  782 ++--
 clang/lib/Headers/avx512cdintrin.h            |   48 +-
 clang/lib/Headers/avx512dqintrin.h            |  268 +-
 clang/lib/Headers/avx512fintrin.h             | 1915 ++++-----
 clang/lib/Headers/avx512fp16intrin.h          |  175 +-
 clang/lib/Headers/avx512ifmaintrin.h          |   24 +-
 clang/lib/Headers/avx512ifmavlintrin.h        |   49 +-
 clang/lib/Headers/avx512vbmi2intrin.h         |  170 +-
 clang/lib/Headers/avx512vbmiintrin.h          |   43 +-
 clang/lib/Headers/avx512vbmivlintrin.h        |   85 +-
 clang/lib/Headers/avx512vlbf16intrin.h        |   48 +-
 clang/lib/Headers/avx512vlbitalgintrin.h      |   24 +-
 clang/lib/Headers/avx512vlbwintrin.h          | 1564 ++++---
 clang/lib/Headers/avx512vlcdintrin.h          |   96 +-
 clang/lib/Headers/avx512vldqintrin.h          |  380 +-
 clang/lib/Headers/avx512vlfp16intrin.h        |  435 +-
 clang/lib/Headers/avx512vlintrin.h            | 3780 ++++++++---------
 clang/lib/Headers/avx512vlvbmi2intrin.h       |  336 +-
 clang/lib/Headers/avx512vlvnniintrin.h        |   96 +-
 clang/lib/Headers/avx512vnniintrin.h          |   48 +-
 clang/lib/Headers/avx512vpopcntdqintrin.h     |   10 +-
 clang/lib/Headers/avx512vpopcntdqvlintrin.h   |   20 +-
 clang/lib/Headers/gfniintrin.h                |   67 +-
 clang/lib/Sema/SemaChecking.cpp               |   56 +
 clang/test/CodeGen/X86/avx512dq-builtins.c    |   25 +-
 .../X86/avx512f-builtins-constrained.c        |    9 +-
 clang/test/CodeGen/X86/avx512f-builtins.c     |   44 +-
 clang/test/CodeGen/X86/avx512fp16-builtins.c  |   12 +-
 clang/test/CodeGen/X86/avx512vl-builtins.c    |  132 +-
 .../test/CodeGen/X86/avx512vlfp16-builtins.c  |   16 +-
 clang/test/Sema/builtin-selectvector.c        |   18 +
 38 files changed, 5371 insertions(+), 5525 deletions(-)
 create mode 100644 clang/test/Sema/builtin-selectvector.c

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 96691b45d63a3..6513676438ffb 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -3019,6 +3019,26 @@ C-style cast applied to each element of the first argument.
 
 Query for this feature with ``__has_builtin(__builtin_convertvector)``.
 
+``__builtin_selectvector``
+--------------------------
+
+``__builtin_selectvector`` is used to express generic vector element selection.
+
+**Signature**:
+
+.. code-block:: c++
+
+  template <class T, std::size_t N>
+  simd_vec<T, N> __builtin_selectvector(simd_vec<T, N> lhs, simd_vec<T, N> rhs,
+                                        simd_vec<bool, N> cond)
+
+**Description**:
+
+The returned vector is equivalent to
+``simd_vec<T, N>{cond[0] ? lhs[0] : rhs[0], ..., cond[N - 1] ? lhs[N - 1] : rhs[N - 1]}``.
+
+Query for this feature with ``__has_builtin(__builtin_selectvector)``.
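As a quick illustration of these semantics (an editorial sketch, not part of the patch): assuming ``bool`` vectors are spelled with Clang's ``ext_vector_type`` attribute, a per-lane blend looks like the following. The typedef names are illustrative only.

.. code-block:: c++

  typedef int  int4  __attribute__((ext_vector_type(4)));
  typedef bool bool4 __attribute__((ext_vector_type(4)));

  // For each lane i: result[i] = cond[i] ? a[i] : b[i].
  int4 blend(int4 a, int4 b, bool4 cond) {
    return __builtin_selectvector(a, b, cond);
  }

With ``a = {1, 2, 3, 4}``, ``b = {5, 6, 7, 8}``, and ``cond = {true, false, true, false}``, the result is ``{1, 6, 3, 8}``.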
+ ``__builtin_bitreverse`` ------------------------ diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index d6ceb450bd106..279330d9b5251 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1176,6 +1176,12 @@ def ConvertVector : Builtin { let Prototype = "void(...)"; } +def SelectVector : Builtin { + let Spellings = ["__builtin_selectvector"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + def AllocaUninitialized : Builtin { let Spellings = ["__builtin_alloca_uninitialized"]; let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index eafcc219c1096..2e099b3ab4f05 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1973,30 +1973,6 @@ TARGET_BUILTIN(__builtin_ia32_vfcmulcph256_mask, "V8fV8fV8fV8fUc", "ncV:256:", TARGET_BUILTIN(__builtin_ia32_vfcmulcph512_mask, "V16fV16fV16fV16fUsIi", "ncV:512:", "avx512fp16,evex512") // generic select intrinsics -TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectb_512, "V64cUOiV64cV64c", "ncV:512:", "avx512bw,evex512") -TARGET_BUILTIN(__builtin_ia32_selectw_128, "V8sUcV8sV8s", "ncV:128:", "avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectw_256, "V16sUsV16sV16s", "ncV:256:", "avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectw_512, "V32sUiV32sV32s", "ncV:512:", "avx512bw,evex512") -TARGET_BUILTIN(__builtin_ia32_selectd_128, "V4iUcV4iV4i", "ncV:128:", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectd_256, "V8iUcV8iV8i", "ncV:256:", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectd_512, "V16iUsV16iV16i", "ncV:512:", "avx512f,evex512") -TARGET_BUILTIN(__builtin_ia32_selectph_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16,avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectph_256, "V16xUsV16xV16x", "ncV:256:", "avx512fp16,avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectph_512, "V32xUiV32xV32x", "ncV:512:", "avx512fp16,evex512") -TARGET_BUILTIN(__builtin_ia32_selectpbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16,avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectpbf_256, "V16yUsV16yV16y", "ncV:256:", "avx512bf16,avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectpbf_512, "V32yUiV32yV32y", "ncV:512:", "avx512bf16,evex512") -TARGET_BUILTIN(__builtin_ia32_selectq_128, "V2OiUcV2OiV2Oi", "ncV:128:", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectq_256, "V4OiUcV4OiV4Oi", "ncV:256:", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectq_512, "V8OiUcV8OiV8Oi", "ncV:512:", "avx512f,evex512") -TARGET_BUILTIN(__builtin_ia32_selectps_128, "V4fUcV4fV4f", "ncV:128:", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectps_256, "V8fUcV8fV8f", "ncV:256:", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectps_512, "V16fUsV16fV16f", "ncV:512:", "avx512f,evex512") -TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "ncV:128:", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl") -TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f,evex512") TARGET_BUILTIN(__builtin_ia32_selectsh_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_selectsbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16") TARGET_BUILTIN(__builtin_ia32_selectss_128, 
"V4fUcV4fV4f", "ncV:128:", "avx512f") diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 774d2b53a3825..7c2222fe51203 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12035,7 +12035,9 @@ def err_builtin_invalid_arg_type: Error < "a floating point type|" "a vector of integers|" "an unsigned integer|" - "an 'int'}1 (was %2)">; + "an 'int'|" + "a vector of bools" + "}1 (was %2)">; def err_builtin_matrix_disabled: Error< "matrix types extension is disabled. Pass -fenable-matrix to enable it">; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index c7b219dcfcec5..487f9a2099eb9 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3744,6 +3744,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(Result); } + case Builtin::BI__builtin_selectvector: { + return RValue::get(Builder.CreateSelect(EmitScalarExpr(E->getArg(2)), + EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)))); + } + case Builtin::BI__builtin_elementwise_abs: { Value *Result; QualType QT = E->getArg(0)->getType(); @@ -15513,31 +15519,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_prorvq256: case X86::BI__builtin_ia32_prorvq512: return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], true); - case X86::BI__builtin_ia32_selectb_128: - case X86::BI__builtin_ia32_selectb_256: - case X86::BI__builtin_ia32_selectb_512: - case X86::BI__builtin_ia32_selectw_128: - case X86::BI__builtin_ia32_selectw_256: - case X86::BI__builtin_ia32_selectw_512: - case X86::BI__builtin_ia32_selectd_128: - case X86::BI__builtin_ia32_selectd_256: - case X86::BI__builtin_ia32_selectd_512: - case X86::BI__builtin_ia32_selectq_128: - case X86::BI__builtin_ia32_selectq_256: - case X86::BI__builtin_ia32_selectq_512: - case X86::BI__builtin_ia32_selectph_128: - case X86::BI__builtin_ia32_selectph_256: - case X86::BI__builtin_ia32_selectph_512: - case X86::BI__builtin_ia32_selectpbf_128: - case X86::BI__builtin_ia32_selectpbf_256: - case X86::BI__builtin_ia32_selectpbf_512: - case X86::BI__builtin_ia32_selectps_128: - case X86::BI__builtin_ia32_selectps_256: - case X86::BI__builtin_ia32_selectps_512: - case X86::BI__builtin_ia32_selectpd_128: - case X86::BI__builtin_ia32_selectpd_256: - case X86::BI__builtin_ia32_selectpd_512: - return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]); case X86::BI__builtin_ia32_selectsh_128: case X86::BI__builtin_ia32_selectsbf_128: case X86::BI__builtin_ia32_selectss_128: diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h index b28d2e243f2cb..1c32831a8cc57 100644 --- a/clang/lib/Headers/avx512bf16intrin.h +++ b/clang/lib/Headers/avx512bf16intrin.h @@ -77,9 +77,9 @@ _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) { /// conversion of __B, and higher 256 bits come from conversion of __A. static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { - return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, - (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), - (__v32bf)__W); + return (__m512bh)__builtin_selectvector( + (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), (__v32bf)__W, + __builtin_bit_cast(__vecmask32, __U)); } /// Convert Two Packed Single Data to One Packed BF16 Data. 
@@ -99,9 +99,9 @@ _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { /// conversion of __B, and higher 256 bits come from conversion of __A. static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) { - return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, - (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), - (__v32bf)_mm512_setzero_si512()); + return (__m512bh)__builtin_selectvector( + (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), (__v32bf)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } /// Convert Packed Single Data to Packed BF16 Data. @@ -200,9 +200,9 @@ _mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) { /// __A, __B and __D static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), - (__v16sf)__D); + return (__m512)__builtin_selectvector( + (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), (__v16sf)__D, + __builtin_bit_cast(__vecmask16, __U)); } /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. @@ -224,9 +224,9 @@ _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) { /// __A, __B and __D static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), - (__v16sf)_mm512_setzero_si512()); + return (__m512)__builtin_selectvector( + (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), (__v16sf)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } /// Convert Packed BF16 Data to Packed float Data. 
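Every header change in this patch follows the same mechanical rewrite: ``__builtin_ia32_select*(U, computed, fallback)`` taking an ``__mmaskN`` bit-mask becomes ``__builtin_selectvector(computed, fallback, __builtin_bit_cast(__vecmaskN, U))``, with the bit cast reinterpreting the N mask bits as a vector of N bools. The ``__vecmaskN`` typedefs themselves are not visible in this excerpt; presumably they are declared once in a shared header, along the lines of this sketch (an assumption, not taken from the patch):

.. code-block:: c++

  // Assumed shape of the mask-vector typedefs (not shown in this excerpt):
  // a bool vector with one lane per mask bit. Clang packs bool vector lanes
  // one bit each, so e.g. sizeof(__vecmask32) == sizeof(__mmask32) == 4 and
  // __builtin_bit_cast between the two types is well-formed.
  typedef bool __vecmask8  __attribute__((ext_vector_type(8)));
  typedef bool __vecmask16 __attribute__((ext_vector_type(16)));
  typedef bool __vecmask32 __attribute__((ext_vector_type(32)));
  typedef bool __vecmask64 __attribute__((ext_vector_type(64)));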
diff --git a/clang/lib/Headers/avx512bitalgintrin.h b/clang/lib/Headers/avx512bitalgintrin.h index bad265ceb7db2..f4e31c287af18 100644 --- a/clang/lib/Headers/avx512bitalgintrin.h +++ b/clang/lib/Headers/avx512bitalgintrin.h @@ -29,9 +29,9 @@ _mm512_popcnt_epi16(__m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) { - return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U, - (__v32hi) _mm512_popcnt_epi16(__B), - (__v32hi) __A); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_popcnt_epi16(__B), + (__v32hi)__A, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -51,9 +51,9 @@ _mm512_popcnt_epi8(__m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) { - return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U, - (__v64qi) _mm512_popcnt_epi8(__B), - (__v64qi) __A); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_popcnt_epi8(__B), + (__v64qi)__A, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index c854720de6a65..ba77f979da1f8 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -369,16 +369,16 @@ _mm512_add_epi8 (__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_add_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_add_epi8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_add_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_add_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -388,16 +388,16 @@ _mm512_sub_epi8 (__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_sub_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_sub_epi8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_sub_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_sub_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -407,16 +407,16 @@ _mm512_add_epi16 (__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_add_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_add_epi16(__A, __B), + (__v32hi)__W, + 
__builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_add_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_add_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -426,16 +426,16 @@ _mm512_sub_epi16 (__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sub_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_sub_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sub_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_sub_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -445,32 +445,30 @@ _mm512_mullo_epi16 (__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mullo_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_mullo_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mullo_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_mullo_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W) { - return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, - (__v64qi) __W, - (__v64qi) __A); + return (__m512i)__builtin_selectvector((__v64qi)__W, (__v64qi)__A, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) { - return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, - (__v32hi) __W, - (__v32hi) __A); + return (__m512i)__builtin_selectvector((__v32hi)__W, (__v32hi)__A, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -482,17 +480,17 @@ _mm512_abs_epi8 (__m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_abs_epi8(__A), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_abs_epi8(__A), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - 
(__v64qi)_mm512_abs_epi8(__A), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_abs_epi8(__A), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -504,17 +502,17 @@ _mm512_abs_epi16 (__m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_abs_epi16(__A), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_abs_epi16(__A), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_abs_epi16(__A), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_abs_epi16(__A), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -526,17 +524,17 @@ _mm512_packs_epi32(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_packs_epi32(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_packs_epi32(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_packs_epi32(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_packs_epi32(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -548,17 +546,17 @@ _mm512_packs_epi16(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_packs_epi16(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_packs_epi16(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_packs_epi16(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_packs_epi16(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -570,17 +568,17 @@ _mm512_packus_epi32(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_packus_epi32(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_packus_epi32(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return 
(__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_packus_epi32(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_packus_epi32(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -592,17 +590,17 @@ _mm512_packus_epi16(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_packus_epi16(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_packus_epi16(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_packus_epi16(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_packus_epi16(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -614,17 +612,17 @@ _mm512_adds_epi8 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_adds_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_adds_epi8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_adds_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_adds_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -636,17 +634,17 @@ _mm512_adds_epi16 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_adds_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_adds_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_adds_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_adds_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -658,17 +656,17 @@ _mm512_adds_epu8 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_adds_epu8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_adds_epu8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_adds_epu8 
(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_adds_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_adds_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -680,17 +678,17 @@ _mm512_adds_epu16 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_adds_epu16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_adds_epu16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_adds_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_adds_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -703,17 +701,17 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_avg_epu8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_avg_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -726,17 +724,17 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_avg_epu16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_avg_epu16(__A, __B), - (__v32hi) _mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -748,17 +746,17 @@ _mm512_max_epi8 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_max_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_max_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epi8 (__m512i 
__W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_max_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_max_epi8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -770,18 +768,18 @@ _mm512_max_epi16 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_max_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_max_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_max_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_max_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -793,17 +791,17 @@ _mm512_max_epu8 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_max_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_max_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_max_epu8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_max_epu8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -815,17 +813,17 @@ _mm512_max_epu16 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_max_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_max_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_max_epu16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_max_epu16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -837,17 +835,17 @@ _mm512_min_epi8 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_min_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_min_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i 
__DEFAULT_FN_ATTRS512 _mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_min_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_min_epi8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -859,17 +857,17 @@ _mm512_min_epi16 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_min_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_min_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_min_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_min_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -881,17 +879,17 @@ _mm512_min_epu8 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_min_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_min_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_min_epu8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_min_epu8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -903,17 +901,17 @@ _mm512_min_epu16 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_min_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_min_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_min_epu16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_min_epu16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -925,17 +923,17 @@ _mm512_shuffle_epi8(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_shuffle_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_shuffle_epi8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); 
} static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_shuffle_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_shuffle_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -947,17 +945,17 @@ _mm512_subs_epi8 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_subs_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_subs_epi8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_subs_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_subs_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -969,17 +967,17 @@ _mm512_subs_epi16 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_subs_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_subs_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_subs_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_subs_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -991,17 +989,17 @@ _mm512_subs_epu8 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_subs_epu8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_subs_epu8(__A, __B), + (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_subs_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_subs_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1013,17 +1011,17 @@ _mm512_subs_epu16 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_subs_epu16(__A, __B), - (__v32hi)__W); + return 
(__m512i)__builtin_selectvector((__v32hi)_mm512_subs_epu16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_subs_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_subs_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1037,27 +1035,27 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), - (__v32hi)__A); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__A, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), - (__v32hi)__I); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__I, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), + (__v32hi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1069,17 +1067,17 @@ _mm512_mulhrs_epi16(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhrs_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_mulhrs_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhrs_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_mulhrs_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1092,17 +1090,17 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_mulhi_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return 
(__m512i)__builtin_selectvector((__v32hi)_mm512_mulhi_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1114,17 +1112,17 @@ _mm512_mulhi_epu16(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epu16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_mulhi_epu16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_mulhi_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1135,16 +1133,16 @@ _mm512_maddubs_epi16(__m512i __X, __m512i __Y) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, - (__v32hi)_mm512_maddubs_epi16(__X, __Y), - (__v32hi)__W); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_maddubs_epi16(__X, __Y), (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, - (__v32hi)_mm512_maddubs_epi16(__X, __Y), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_maddubs_epi16(__X, __Y), (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1154,16 +1152,16 @@ _mm512_madd_epi16(__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_madd_epi16(__A, __B), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_madd_epi16(__A, __B), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_madd_epi16(__A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_madd_epi16(__A, __B), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS512 @@ -1270,16 +1268,16 @@ _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_unpackhi_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_unpackhi_epi8(__A, __B), (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return 
(__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_unpackhi_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_unpackhi_epi8(__A, __B), (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1297,16 +1295,16 @@ _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_unpackhi_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_unpackhi_epi16(__A, __B), (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_unpackhi_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_unpackhi_epi16(__A, __B), (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1332,16 +1330,16 @@ _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_unpacklo_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_unpacklo_epi8(__A, __B), (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_unpacklo_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_unpacklo_epi8(__A, __B), (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1359,16 +1357,16 @@ _mm512_unpacklo_epi16(__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_unpacklo_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_unpacklo_epi16(__A, __B), (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_unpacklo_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_unpacklo_epi16(__A, __B), (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1382,17 +1380,17 @@ _mm512_cvtepi8_epi16(__m256i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtepi8_epi16(__A), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_cvtepi8_epi16(__A), + (__v32hi)__W, + 
__builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtepi8_epi16(__A), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_cvtepi8_epi16(__A), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1404,51 +1402,44 @@ _mm512_cvtepu8_epi16(__m256i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtepu8_epi16(__A), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_cvtepu8_epi16(__A), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtepu8_epi16(__A), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_cvtepu8_epi16(__A), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } - #define _mm512_shufflehi_epi16(A, imm) \ ((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm))) -#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflehi_epi16((A), \ - (imm)), \ - (__v32hi)(__m512i)(W))) +#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v32hi)_mm512_shufflehi_epi16((A), (imm)), (__v32hi)(__m512i)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) -#define _mm512_maskz_shufflehi_epi16(U, A, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflehi_epi16((A), \ - (imm)), \ - (__v32hi)_mm512_setzero_si512())) +#define _mm512_maskz_shufflehi_epi16(U, A, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v32hi)_mm512_shufflehi_epi16((A), (imm)), \ + (__v32hi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask32, (U)))) #define _mm512_shufflelo_epi16(A, imm) \ ((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm))) +#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v32hi)_mm512_shufflelo_epi16((A), (imm)), (__v32hi)(__m512i)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) -#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflelo_epi16((A), \ - (imm)), \ - (__v32hi)(__m512i)(W))) - - -#define _mm512_maskz_shufflelo_epi16(U, A, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflelo_epi16((A), \ - (imm)), \ - (__v32hi)_mm512_setzero_si512())) +#define _mm512_maskz_shufflelo_epi16(U, A, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v32hi)_mm512_shufflelo_epi16((A), (imm)), \ + (__v32hi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sllv_epi16(__m512i __A, __m512i __B) @@ -1459,17 +1450,17 @@ _mm512_sllv_epi16(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sllv_epi16(__A, __B), - (__v32hi)__W); + 
return (__m512i)__builtin_selectvector((__v32hi)_mm512_sllv_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sllv_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_sllv_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1481,17 +1472,17 @@ _mm512_sll_epi16(__m512i __A, __m128i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sll_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_sll_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sll_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_sll_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1504,17 +1495,17 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_slli_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_slli_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_slli_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_slli_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } #define _mm512_bslli_epi128(a, imm) \ @@ -1529,17 +1520,17 @@ _mm512_srlv_epi16(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srlv_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_srlv_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srlv_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_srlv_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1551,17 +1542,17 @@ _mm512_srav_epi16(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - 
(__v32hi)_mm512_srav_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_srav_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srav_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_srav_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1573,17 +1564,17 @@ _mm512_sra_epi16(__m512i __A, __m128i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sra_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_sra_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sra_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_sra_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1596,17 +1587,17 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srai_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_srai_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srai_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_srai_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1618,17 +1609,17 @@ _mm512_srl_epi16(__m512i __A, __m128i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srl_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_srl_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srl_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_srl_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1641,17 +1632,17 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - 
(__v32hi)_mm512_srli_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_srli_epi16(__A, __B), + (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srli_epi16(__A, (unsigned int)__B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_srli_epi16(__A, (unsigned int)__B), + (__v32hi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask32, __U)); } #define _mm512_bsrli_epi128(a, imm) \ @@ -1660,49 +1651,47 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) { - return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, - (__v32hi) __A, - (__v32hi) __W); + return (__m512i)__builtin_selectvector((__v32hi)__A, (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) { - return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, - (__v32hi) __A, - (__v32hi) _mm512_setzero_si512 ()); + return (__m512i)__builtin_selectvector((__v32hi)__A, + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A) { - return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, - (__v64qi) __A, - (__v64qi) __W); + return (__m512i)__builtin_selectvector((__v64qi)__A, (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) { - return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, - (__v64qi) __A, - (__v64qi) _mm512_setzero_si512 ()); + return (__m512i)__builtin_selectvector((__v64qi)__A, + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) { - return (__m512i) __builtin_ia32_selectb_512(__M, - (__v64qi)_mm512_set1_epi8(__A), - (__v64qi) __O); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_set1_epi8(__A), + (__v64qi)__O, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) { - return (__m512i) __builtin_ia32_selectb_512(__M, - (__v64qi) _mm512_set1_epi8(__A), - (__v64qi) _mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_set1_epi8(__A), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd(__mmask64 __A, @@ -1896,33 +1885,33 @@ _mm512_broadcastb_epi8 (__m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectb_512(__M, - (__v64qi) _mm512_broadcastb_epi8(__A), - (__v64qi) __O); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_broadcastb_epi8(__A), + (__v64qi)__O, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectb_512(__M, - (__v64qi) _mm512_broadcastb_epi8(__A), - (__v64qi) 
_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v64qi)_mm512_broadcastb_epi8(__A), + (__v64qi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) { - return (__m512i) __builtin_ia32_selectw_512(__M, - (__v32hi) _mm512_set1_epi16(__A), - (__v32hi) __O); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_set1_epi16(__A), + (__v32hi)__O, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_set1_epi16 (__mmask32 __M, short __A) { - return (__m512i) __builtin_ia32_selectw_512(__M, - (__v32hi) _mm512_set1_epi16(__A), - (__v32hi) _mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_set1_epi16(__A), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1936,17 +1925,17 @@ _mm512_broadcastw_epi16 (__m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectw_512(__M, - (__v32hi) _mm512_broadcastw_epi16(__A), - (__v32hi) __O); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_broadcastw_epi16(__A), + (__v32hi)__O, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectw_512(__M, - (__v32hi) _mm512_broadcastw_epi16(__A), - (__v32hi) _mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v32hi)_mm512_broadcastw_epi16(__A), + (__v32hi)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1959,47 +1948,48 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_permutexvar_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_permutexvar_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_permutexvar_epi16(__A, __B), - (__v32hi)__W); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_permutexvar_epi16(__A, __B), (__v32hi)__W, + __builtin_bit_cast(__vecmask32, __M)); } #define _mm512_alignr_epi8(A, B, N) \ ((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \ (__v64qi)(__m512i)(B), (int)(N))) -#define _mm512_mask_alignr_epi8(W, U, A, B, N) \ - ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ - (__v64qi)(__m512i)(W))) +#define _mm512_mask_alignr_epi8(W, U, A, B, N) \ + ((__m512i)__builtin_selectvector( \ + (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), (__v64qi)(__m512i)(W), \ + __builtin_bit_cast(__vecmask64, (U)))) -#define _mm512_maskz_alignr_epi8(U, A, B, N) \ - ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ - (__v64qi)(__m512i)_mm512_setzero_si512())) +#define _mm512_maskz_alignr_epi8(U, A, B, N) \ + ((__m512i)__builtin_selectvector( \ + (__v64qi)_mm512_alignr_epi8((A), (B), 
(int)(N)), \ + (__v64qi)(__m512i)_mm512_setzero_si512(), \ + __builtin_bit_cast(__vecmask64, (U)))) #define _mm512_dbsad_epu8(A, B, imm) \ ((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \ (__v64qi)(__m512i)(B), (int)(imm))) -#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ - (__v32hi)(__m512i)(W))) +#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), (__v32hi)(__m512i)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) -#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ - (__v32hi)_mm512_setzero_si512())) +#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ + (__v32hi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sad_epu8 (__m512i __A, __m512i __B) diff --git a/clang/lib/Headers/avx512cdintrin.h b/clang/lib/Headers/avx512cdintrin.h index 33b552f6fe6ad..182f1372c6090 100644 --- a/clang/lib/Headers/avx512cdintrin.h +++ b/clang/lib/Headers/avx512cdintrin.h @@ -28,17 +28,17 @@ _mm512_conflict_epi64 (__m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_conflict_epi64(__A), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_conflict_epi64(__A), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_conflict_epi64(__A), - (__v8di)_mm512_setzero_si512 ()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_conflict_epi64(__A), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -50,17 +50,17 @@ _mm512_conflict_epi32 (__m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_conflict_epi32(__A), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_conflict_epi32(__A), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_conflict_epi32(__A), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_conflict_epi32(__A), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -72,17 +72,17 @@ _mm512_lzcnt_epi32 (__m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_lzcnt_epi32(__A), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_lzcnt_epi32(__A), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i 
__A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_lzcnt_epi32(__A), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_lzcnt_epi32(__A), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -94,17 +94,17 @@ _mm512_lzcnt_epi64 (__m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_lzcnt_epi64(__A), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_lzcnt_epi64(__A), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_lzcnt_epi64(__A), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_lzcnt_epi64(__A), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index 88b48e3a32070..52cc2706bc83b 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -155,16 +155,16 @@ _mm512_mullo_epi64 (__m512i __A, __m512i __B) { static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_mullo_epi64(__A, __B), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_mullo_epi64(__A, __B), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_mullo_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_mullo_epi64(__A, __B), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -174,16 +174,16 @@ _mm512_xor_pd(__m512d __A, __m512d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_xor_pd(__A, __B), - (__v8df)__W); + return (__m512d)__builtin_selectvector((__v8df)_mm512_xor_pd(__A, __B), + (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_xor_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_xor_pd(__A, __B), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -193,16 +193,16 @@ _mm512_xor_ps (__m512 __A, __m512 __B) { static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_xor_ps(__A, __B), - (__v16sf)__W); + return (__m512)__builtin_selectvector((__v16sf)_mm512_xor_ps(__A, __B), + (__v16sf)__W, + 
__builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_xor_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_xor_ps(__A, __B), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -212,16 +212,16 @@ _mm512_or_pd(__m512d __A, __m512d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_or_pd(__A, __B), - (__v8df)__W); + return (__m512d)__builtin_selectvector((__v8df)_mm512_or_pd(__A, __B), + (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_or_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_or_pd(__A, __B), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -231,16 +231,16 @@ _mm512_or_ps(__m512 __A, __m512 __B) { static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_or_ps(__A, __B), - (__v16sf)__W); + return (__m512)__builtin_selectvector((__v16sf)_mm512_or_ps(__A, __B), + (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_or_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_or_ps(__A, __B), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -250,16 +250,16 @@ _mm512_and_pd(__m512d __A, __m512d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_and_pd(__A, __B), - (__v8df)__W); + return (__m512d)__builtin_selectvector((__v8df)_mm512_and_pd(__A, __B), + (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_and_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_and_pd(__A, __B), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -269,16 +269,16 @@ _mm512_and_ps(__m512 __A, __m512 __B) { static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_and_ps(__A, __B), - (__v16sf)__W); + return (__m512)__builtin_selectvector((__v16sf)_mm512_and_ps(__A, __B), + (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_and_ps(__mmask16 
__U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_and_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_and_ps(__A, __B), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -288,16 +288,16 @@ _mm512_andnot_pd(__m512d __A, __m512d __B) { static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_andnot_pd(__A, __B), - (__v8df)__W); + return (__m512d)__builtin_selectvector((__v8df)_mm512_andnot_pd(__A, __B), + (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_andnot_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_andnot_pd(__A, __B), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -307,16 +307,16 @@ _mm512_andnot_ps(__m512 __A, __m512 __B) { static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_andnot_ps(__A, __B), - (__v16sf)__W); + return (__m512)__builtin_selectvector((__v16sf)_mm512_andnot_ps(__A, __B), + (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_andnot_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_andnot_ps(__A, __B), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -483,16 +483,16 @@ _mm512_cvtepi64_pd (__m512i __A) { static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtepi64_pd(__A), - (__v8df)__W); + return (__m512d)__builtin_selectvector((__v8df)_mm512_cvtepi64_pd(__A), + (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtepi64_pd(__A), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_cvtepi64_pd(__A), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } #define _mm512_cvt_roundepi64_pd(A, R) \ @@ -713,16 +713,16 @@ _mm512_cvtepu64_pd (__m512i __A) { static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtepu64_pd(__A), - (__v8df)__W); + return (__m512d)__builtin_selectvector((__v8df)_mm512_cvtepu64_pd(__A), + (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - 
(__v8df)_mm512_cvtepu64_pd(__A), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_cvtepu64_pd(__A), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } #define _mm512_cvt_roundepu64_pd(A, R) \ @@ -1088,17 +1088,17 @@ _mm512_broadcast_f32x2 (__m128 __A) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x2(__A), - (__v16sf)__O); + return (__m512)__builtin_selectvector((__v16sf)_mm512_broadcast_f32x2(__A), + (__v16sf)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x2(__A), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_broadcast_f32x2(__A), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -1112,17 +1112,17 @@ _mm512_broadcast_f32x8(__m256 __A) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x8(__A), - (__v16sf)__O); + return (__m512)__builtin_selectvector((__v16sf)_mm512_broadcast_f32x8(__A), + (__v16sf)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x8(__A), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_broadcast_f32x8(__A), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -1135,17 +1135,17 @@ _mm512_broadcast_f64x2(__m128d __A) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, - (__v8df)_mm512_broadcast_f64x2(__A), - (__v8df)__O); + return (__m512d)__builtin_selectvector((__v8df)_mm512_broadcast_f64x2(__A), + (__v8df)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, - (__v8df)_mm512_broadcast_f64x2(__A), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_broadcast_f64x2(__A), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1159,17 +1159,17 @@ _mm512_broadcast_i32x2 (__m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x2(__A), - (__v16si)__O); + return (__m512i)__builtin_selectvector((__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x2(__A), - (__v16si)_mm512_setzero_si512()); + return 
(__m512i)__builtin_selectvector((__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1183,17 +1183,17 @@ _mm512_broadcast_i32x8(__m256i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x8(__A), - (__v16si)__O); + return (__m512i)__builtin_selectvector((__v16si)_mm512_broadcast_i32x8(__A), + (__v16si)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x8(__A), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_broadcast_i32x8(__A), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1206,17 +1206,17 @@ _mm512_broadcast_i64x2(__m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_broadcast_i64x2(__A), - (__v8di)__O); + return (__m512i)__builtin_selectvector((__v8di)_mm512_broadcast_i64x2(__A), + (__v8di)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_broadcast_i64x2(__A), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_broadcast_i64x2(__A), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __M)); } #define _mm512_extractf32x8_ps(A, imm) \ @@ -1289,57 +1289,57 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) ((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \ (__v8sf)(__m256)(B), (int)(imm))) -#define _mm512_mask_insertf32x8(W, U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)(__m512)(W))) +#define _mm512_mask_insertf32x8(W, U, A, B, imm) \ + ((__m512)__builtin_selectvector( \ + (__v16sf)_mm512_insertf32x8((A), (B), (imm)), (__v16sf)(__m512)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_insertf32x8(U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps())) +#define _mm512_maskz_insertf32x8(U, A, B, imm) \ + ((__m512)__builtin_selectvector( \ + (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps(), __builtin_bit_cast(__vecmask16, (U)))) #define _mm512_insertf64x2(A, B, imm) \ ((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \ (__v2df)(__m128d)(B), (int)(imm))) -#define _mm512_mask_insertf64x2(W, U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)(__m512d)(W))) +#define _mm512_mask_insertf64x2(W, U, A, B, imm) \ + ((__m512d)__builtin_selectvector( \ + (__v8df)_mm512_insertf64x2((A), (B), (imm)), (__v8df)(__m512d)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_insertf64x2(U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - 
(__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd())) +#define _mm512_maskz_insertf64x2(U, A, B, imm) \ + ((__m512d)__builtin_selectvector( \ + (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd(), __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_inserti32x8(A, B, imm) \ ((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \ (__v8si)(__m256i)(B), (int)(imm))) -#define _mm512_mask_inserti32x8(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)(__m512i)(W))) +#define _mm512_mask_inserti32x8(W, U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v16si)_mm512_inserti32x8((A), (B), (imm)), (__v16si)(__m512i)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_inserti32x8(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512())) +#define _mm512_maskz_inserti32x8(U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, (U)))) #define _mm512_inserti64x2(A, B, imm) \ ((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \ (__v2di)(__m128i)(B), (int)(imm))) -#define _mm512_mask_inserti64x2(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)(__m512i)(W))) +#define _mm512_mask_inserti64x2(W, U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v8di)_mm512_inserti64x2((A), (B), (imm)), (__v8di)(__m512i)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_inserti64x2(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512())) +#define _mm512_maskz_inserti64x2(U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_mask_fpclass_ps_mask(U, A, imm) \ ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 4f172c74b31cb..99d3c1a81267b 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -41,6 +41,22 @@ typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1))) typedef unsigned char __mmask8; typedef unsigned short __mmask16; +#ifdef __cplusplus +typedef bool __vecmask2 __attribute__((__ext_vector_type__(2))); +typedef bool __vecmask4 __attribute__((__ext_vector_type__(4))); +typedef bool __vecmask8 __attribute__((__ext_vector_type__(8))); +typedef bool __vecmask16 __attribute__((__ext_vector_type__(16))); +typedef bool __vecmask32 __attribute__((__ext_vector_type__(32))); +typedef bool __vecmask64 __attribute__((__ext_vector_type__(64))); +#else +typedef _Bool __vecmask2 __attribute__((__ext_vector_type__(2))); +typedef _Bool __vecmask4 __attribute__((__ext_vector_type__(4))); +typedef _Bool __vecmask8 __attribute__((__ext_vector_type__(8))); +typedef _Bool __vecmask16 __attribute__((__ext_vector_type__(16))); +typedef _Bool __vecmask32 __attribute__((__ext_vector_type__(32))); +typedef _Bool __vecmask64 __attribute__((__ext_vector_type__(64))); +#endif + /* Rounding mode macros. 
*/ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -219,17 +235,17 @@ _mm512_broadcastd_epi32 (__m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectd_512(__M, - (__v16si) _mm512_broadcastd_epi32(__A), - (__v16si) __O); + return (__m512i)__builtin_selectvector((__v16si)_mm512_broadcastd_epi32(__A), + (__v16si)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectd_512(__M, - (__v16si) _mm512_broadcastd_epi32(__A), - (__v16si) _mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_broadcastd_epi32(__A), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -242,21 +258,19 @@ _mm512_broadcastq_epi64 (__m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di) _mm512_broadcastq_epi64(__A), - (__v8di) __O); - + return (__m512i)__builtin_selectvector((__v8di)_mm512_broadcastq_epi64(__A), + (__v8di)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di) _mm512_broadcastq_epi64(__A), - (__v8di) _mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_broadcastq_epi64(__A), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __M)); } - static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_setzero_ps(void) { @@ -320,9 +334,9 @@ _mm512_set1_epi32(int __s) static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_set1_epi32(__mmask16 __M, int __A) { - return (__m512i)__builtin_ia32_selectd_512(__M, - (__v16si)_mm512_set1_epi32(__A), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_set1_epi32(__A), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -334,9 +348,9 @@ _mm512_set1_epi64(long long __d) static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) { - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_set1_epi64(__A), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_set1_epi64(__A), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -649,9 +663,9 @@ _mm512_and_epi32(__m512i __a, __m512i __b) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, - (__v16si) _mm512_and_epi32(__a, __b), - (__v16si) __src); + return (__m512i)__builtin_selectvector((__v16si)_mm512_and_epi32(__a, __b), + (__v16si)__src, + __builtin_bit_cast(__vecmask16, __k)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -670,9 +684,9 @@ _mm512_and_epi64(__m512i __a, __m512i __b) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) { - return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k, - (__v8di) 
_mm512_and_epi64(__a, __b), - (__v8di) __src); + return (__m512i)__builtin_selectvector((__v8di)_mm512_and_epi64(__a, __b), + (__v8di)__src, + __builtin_bit_cast(__vecmask8, __k)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -697,9 +711,9 @@ _mm512_andnot_epi32 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_andnot_epi32(__A, __B), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_andnot_epi32(__A, __B), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -718,9 +732,9 @@ _mm512_andnot_epi64(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_andnot_epi64(__A, __B), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_andnot_epi64(__A, __B), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -739,9 +753,9 @@ _mm512_or_epi32(__m512i __a, __m512i __b) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, - (__v16si)_mm512_or_epi32(__a, __b), - (__v16si)__src); + return (__m512i)__builtin_selectvector((__v16si)_mm512_or_epi32(__a, __b), + (__v16si)__src, + __builtin_bit_cast(__vecmask16, __k)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -759,9 +773,9 @@ _mm512_or_epi64(__m512i __a, __m512i __b) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, - (__v8di)_mm512_or_epi64(__a, __b), - (__v8di)__src); + return (__m512i)__builtin_selectvector((__v8di)_mm512_or_epi64(__a, __b), + (__v8di)__src, + __builtin_bit_cast(__vecmask8, __k)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -779,9 +793,9 @@ _mm512_xor_epi32(__m512i __a, __m512i __b) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, - (__v16si)_mm512_xor_epi32(__a, __b), - (__v16si)__src); + return (__m512i)__builtin_selectvector((__v16si)_mm512_xor_epi32(__a, __b), + (__v16si)__src, + __builtin_bit_cast(__vecmask16, __k)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -799,9 +813,9 @@ _mm512_xor_epi64(__m512i __a, __m512i __b) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, - (__v8di)_mm512_xor_epi64(__a, __b), - (__v8di)__src); + return (__m512i)__builtin_selectvector((__v8di)_mm512_xor_epi64(__a, __b), + (__v8di)__src, + __builtin_bit_cast(__vecmask8, __k)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -875,17 +889,17 @@ _mm512_add_epi64 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_add_epi64(__A, __B), - (__v8di)__W); + return 
(__m512i)__builtin_selectvector((__v8di)_mm512_add_epi64(__A, __B), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_add_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_add_epi64(__A, __B), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -897,17 +911,17 @@ _mm512_sub_epi64 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sub_epi64(__A, __B), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_sub_epi64(__A, __B), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sub_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_sub_epi64(__A, __B), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -919,17 +933,17 @@ _mm512_add_epi32 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_add_epi32(__A, __B), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_add_epi32(__A, __B), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_add_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_add_epi32(__A, __B), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -941,32 +955,32 @@ _mm512_sub_epi32 (__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sub_epi32(__A, __B), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_sub_epi32(__A, __B), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sub_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_sub_epi32(__A, __B), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } #define _mm512_max_round_pd(A, B, R) \ ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(R))) -#define _mm512_mask_max_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_max_round_pd((A), (B), (R)), \ - (__v8df)(W))) +#define _mm512_mask_max_round_pd(W, U, 
A, B, R) \ + ((__m512d)__builtin_selectvector((__v8df)_mm512_max_round_pd((A), (B), (R)), \ + (__v8df)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_max_round_pd(U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_max_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd())) +#define _mm512_maskz_max_round_pd(U, A, B, R) \ + ((__m512d)__builtin_selectvector((__v8df)_mm512_max_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd(), \ + __builtin_bit_cast(__vecmask8, (U)))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_max_pd(__m512d __A, __m512d __B) @@ -978,32 +992,32 @@ _mm512_max_pd(__m512d __A, __m512d __B) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_max_pd(__A, __B), - (__v8df)__W); + return (__m512d)__builtin_selectvector((__v8df)_mm512_max_pd(__A, __B), + (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_max_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_max_pd(__A, __B), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } #define _mm512_max_round_ps(A, B, R) \ ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(R))) -#define _mm512_mask_max_round_ps(W, U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ - (__v16sf)(W))) +#define _mm512_mask_max_round_ps(W, U, A, B, R) \ + ((__m512)__builtin_selectvector((__v16sf)_mm512_max_round_ps((A), (B), (R)), \ + (__v16sf)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_max_round_ps(U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps())) +#define _mm512_maskz_max_round_ps(U, A, B, R) \ + ((__m512)__builtin_selectvector((__v16sf)_mm512_max_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps(), \ + __builtin_bit_cast(__vecmask16, (U)))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_max_ps(__m512 __A, __m512 __B) @@ -1015,17 +1029,17 @@ _mm512_max_ps(__m512 __A, __m512 __B) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_max_ps(__A, __B), - (__v16sf)__W); + return (__m512)__builtin_selectvector((__v16sf)_mm512_max_ps(__A, __B), + (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_max_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_max_ps(__A, __B), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -1110,17 +1124,17 @@ _mm512_max_epi32(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_max_epi32(__A, __B), - (__v16si)__W); + return 
(__m512i)__builtin_selectvector((__v16si)_mm512_max_epi32(__A, __B), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_max_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_max_epi32(__A, __B), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -1132,17 +1146,17 @@ _mm512_max_epu32(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_max_epu32(__A, __B), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_max_epu32(__A, __B), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_max_epu32(__A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_max_epu32(__A, __B), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -1154,17 +1168,17 @@ _mm512_max_epi64(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_max_epi64(__A, __B), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_max_epi64(__A, __B), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_max_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_max_epi64(__A, __B), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -1176,32 +1190,32 @@ _mm512_max_epu64(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_max_epu64(__A, __B), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_max_epu64(__A, __B), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_max_epu64(__A, __B), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_max_epu64(__A, __B), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __M)); } #define _mm512_min_round_pd(A, B, R) \ ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(R))) -#define _mm512_mask_min_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_min_round_pd((A), (B), (R)), \ - (__v8df)(W))) +#define _mm512_mask_min_round_pd(W, U, A, 
B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_min_round_pd((A), (B), (R)), \
+                                   (__v8df)(W), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm512_maskz_min_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_min_round_pd((A), (B), (R)), \
-                                        (__v8df)_mm512_setzero_pd()))
+#define _mm512_maskz_min_round_pd(U, A, B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_min_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_min_pd(__m512d __A, __m512d __B)
@@ -1213,32 +1227,32 @@ _mm512_min_pd(__m512d __A, __m512d __B)
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_min_pd(__A, __B),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_min_pd(__A, __B),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_min_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_min_pd(__A, __B),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 #define _mm512_min_round_ps(A, B, R) \
   ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
                                    (__v16sf)(__m512)(B), (int)(R)))
 
-#define _mm512_mask_min_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
-                                       (__v16sf)(W)))
+#define _mm512_mask_min_round_ps(W, U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_min_round_ps((A), (B), (R)), \
+                                  (__v16sf)(W), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
-#define _mm512_maskz_min_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
-                                       (__v16sf)_mm512_setzero_ps()))
+#define _mm512_maskz_min_round_ps(U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_min_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_min_ps(__m512 __A, __m512 __B)
@@ -1250,17 +1264,17 @@ _mm512_min_ps(__m512 __A, __m512 __B)
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_min_ps(__A, __B),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_min_ps(__A, __B),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_min_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_min_ps(__A, __B),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
@@ -1345,17 +1359,17 @@ _mm512_min_epi32(__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_min_epi32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_min_epi32(__A, __B),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __M));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_min_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_min_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __M));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -1367,17 +1381,17 @@ _mm512_min_epu32(__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_min_epu32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_min_epu32(__A, __B),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __M));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_min_epu32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_min_epu32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __M));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -1389,17 +1403,17 @@ _mm512_min_epi64(__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_min_epi64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_min_epi64(__A, __B),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_min_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_min_epi64(__A, __B),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -1411,17 +1425,17 @@ _mm512_min_epu64(__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_min_epu64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_min_epu64(__A, __B),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_min_epu64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_min_epu64(__A, __B),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -1433,17 +1447,17 @@ _mm512_mul_epi32(__m512i __X, __m512i __Y)
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_mul_epi32(__X, __Y),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_mul_epi32(__X, __Y),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_mul_epi32(__X, __Y),
-                                             (__v8di)_mm512_setzero_si512 ());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_mul_epi32(__X, __Y),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -1455,17 +1469,17 @@ _mm512_mul_epu32(__m512i __X, __m512i __Y)
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_mul_epu32(__X, __Y),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_mul_epu32(__X, __Y),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
-                                             (__v8di)_mm512_mul_epu32(__X, __Y),
-                                             (__v8di)_mm512_setzero_si512 ());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_mul_epu32(__X, __Y),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -1477,17 +1491,17 @@ _mm512_mullo_epi32 (__m512i __A, __m512i __B)
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_mullo_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_mullo_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __M));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
-                                             (__v16si)_mm512_mullo_epi32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_mullo_epi32(__A, __B),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __M));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1497,23 +1511,23 @@ _mm512_mullox_epi64 (__m512i __A, __m512i __B) {
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_mullox_epi64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_mullox_epi64(__A, __B),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 #define _mm512_sqrt_round_pd(A, R) \
   ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))
 
-#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
-                                        (__v8df)(__m512d)(W)))
+#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_sqrt_round_pd((A), (R)), \
+                                   (__v8df)(__m512d)(W), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm512_maskz_sqrt_round_pd(U, A, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
-                                        (__v8df)_mm512_setzero_pd()))
+#define _mm512_maskz_sqrt_round_pd(U, A, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_sqrt_round_pd((A), (R)), \
+                                   (__v8df)_mm512_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_sqrt_pd(__m512d __A)
@@ -1525,31 +1539,31 @@ _mm512_sqrt_pd(__m512d __A)
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_sqrt_pd(__A),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_sqrt_pd(__A),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
 {
-  return (__m512d)__builtin_ia32_selectpd_512(__U,
-                                              (__v8df)_mm512_sqrt_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_sqrt_pd(__A),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 #define _mm512_sqrt_round_ps(A, R) \
   ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))
 
-#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
-                                       (__v16sf)(__m512)(W)))
+#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_sqrt_round_ps((A), (R)), \
+                                  (__v16sf)(__m512)(W), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
-#define _mm512_maskz_sqrt_round_ps(U, A, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
-                                       (__v16sf)_mm512_setzero_ps()))
+#define _mm512_maskz_sqrt_round_ps(U, A, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_sqrt_round_ps((A), (R)), \
+                                  (__v16sf)_mm512_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_sqrt_ps(__m512 __A)
@@ -1561,17 +1575,17 @@ _mm512_sqrt_ps(__m512 __A)
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
 {
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_sqrt_ps(__A),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_sqrt_ps(__A),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
 {
-  return (__m512)__builtin_ia32_selectps_512(__U,
-                                             (__v16sf)_mm512_sqrt_ps(__A),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_sqrt_ps(__A),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
@@ -1870,17 +1884,17 @@ _mm512_abs_epi64(__m512i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_abs_epi64(__A),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_abs_epi64(__A),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_abs_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_abs_epi64(__A),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -1892,17 +1906,17 @@ _mm512_abs_epi32(__m512i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_abs_epi32(__A),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_abs_epi32(__A),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_abs_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_abs_epi32(__A),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
@@ -1966,59 +1980,59 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_add_pd(__A, __B),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_add_pd(__A, __B),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_add_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_add_pd(__A, __B),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_add_ps(__A, __B),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_add_ps(__A, __B),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_add_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_add_ps(__A, __B),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 #define _mm512_add_round_pd(A, B, R) \
   ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
                                     (__v8df)(__m512d)(B), (int)(R)))
 
-#define _mm512_mask_add_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_add_round_pd((A), (B), (R)), \
-                                        (__v8df)(__m512d)(W)))
+#define _mm512_mask_add_round_pd(W, U, A, B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_add_round_pd((A), (B), (R)), \
+                                   (__v8df)(__m512d)(W), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm512_maskz_add_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_add_round_pd((A), (B), (R)), \
-                                        (__v8df)_mm512_setzero_pd()))
+#define _mm512_maskz_add_round_pd(U, A, B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_add_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm512_add_round_ps(A, B, R) \
   ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
                                    (__v16sf)(__m512)(B), (int)(R)))
 
-#define _mm512_mask_add_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
-                                       (__v16sf)(__m512)(W)))
+#define _mm512_mask_add_round_ps(W, U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_add_round_ps((A), (B), (R)), \
+                                  (__v16sf)(__m512)(W), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
-#define _mm512_maskz_add_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
-                                       (__v16sf)_mm512_setzero_ps()))
+#define _mm512_maskz_add_round_ps(U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_add_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
@@ -2081,59 +2095,59 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_sub_pd(__A, __B),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_sub_pd(__A, __B),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_sub_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_sub_pd(__A, __B),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_sub_ps(__A, __B),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_sub_ps(__A, __B),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_sub_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_sub_ps(__A, __B),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 #define _mm512_sub_round_pd(A, B, R) \
   ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
                                     (__v8df)(__m512d)(B), (int)(R)))
 
-#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
-                                        (__v8df)(__m512d)(W)))
+#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_sub_round_pd((A), (B), (R)), \
+                                   (__v8df)(__m512d)(W), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm512_maskz_sub_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
-                                        (__v8df)_mm512_setzero_pd()))
+#define _mm512_maskz_sub_round_pd(U, A, B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_sub_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm512_sub_round_ps(A, B, R) \
   ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
                                    (__v16sf)(__m512)(B), (int)(R)))
 
-#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
-                                       (__v16sf)(__m512)(W)))
+#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
+                                  (__v16sf)(__m512)(W), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
-#define _mm512_maskz_sub_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
-                                       (__v16sf)_mm512_setzero_ps()))
+#define _mm512_maskz_sub_round_ps(U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
@@ -2196,59 +2210,59 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_mul_pd(__A, __B),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_mul_pd(__A, __B),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_mul_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_mul_pd(__A, __B),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_mul_ps(__A, __B),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_mul_ps(__A, __B),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_mul_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_mul_ps(__A, __B),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 #define _mm512_mul_round_pd(A, B, R) \
   ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
                                     (__v8df)(__m512d)(B), (int)(R)))
 
-#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
-                                        (__v8df)(__m512d)(W)))
+#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_mul_round_pd((A), (B), (R)), \
+                                   (__v8df)(__m512d)(W), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm512_maskz_mul_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
-                                        (__v8df)_mm512_setzero_pd()))
+#define _mm512_maskz_mul_round_pd(U, A, B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_mul_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm512_mul_round_ps(A, B, R) \
   ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
                                    (__v16sf)(__m512)(B), (int)(R)))
 
-#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
-                                       (__v16sf)(__m512)(W)))
+#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
+                                  (__v16sf)(__m512)(W), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
-#define _mm512_maskz_mul_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
-                                       (__v16sf)_mm512_setzero_ps()))
+#define _mm512_maskz_mul_round_ps(U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
@@ -2318,16 +2332,16 @@ _mm512_div_pd(__m512d __a, __m512d __b)
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_div_pd(__A, __B),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_div_pd(__A, __B),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_div_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_div_pd(__A, __B),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
@@ -2338,45 +2352,45 @@ _mm512_div_ps(__m512 __a, __m512 __b)
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_div_ps(__A, __B),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_div_ps(__A, __B),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_div_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_div_ps(__A, __B),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 #define _mm512_div_round_pd(A, B, R) \
   ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
                                     (__v8df)(__m512d)(B), (int)(R)))
 
-#define _mm512_mask_div_round_pd(W, U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_div_round_pd((A), (B), (R)), \
-                                        (__v8df)(__m512d)(W)))
+#define _mm512_mask_div_round_pd(W, U, A, B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_div_round_pd((A), (B), (R)), \
+                                   (__v8df)(__m512d)(W), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm512_maskz_div_round_pd(U, A, B, R) \
-  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
-                                        (__v8df)_mm512_div_round_pd((A), (B), (R)), \
-                                        (__v8df)_mm512_setzero_pd()))
+#define _mm512_maskz_div_round_pd(U, A, B, R) \
+  ((__m512d)__builtin_selectvector((__v8df)_mm512_div_round_pd((A), (B), (R)), \
+                                   (__v8df)_mm512_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm512_div_round_ps(A, B, R) \
   ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
                                    (__v16sf)(__m512)(B), (int)(R)))
 
-#define _mm512_mask_div_round_ps(W, U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
-                                       (__v16sf)(__m512)(W)))
+#define _mm512_mask_div_round_ps(W, U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_div_round_ps((A), (B), (R)), \
+                                  (__v16sf)(__m512)(W), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
-#define _mm512_maskz_div_round_ps(U, A, B, R) \
-  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
-                                       (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
-                                       (__v16sf)_mm512_setzero_ps()))
+#define _mm512_maskz_div_round_ps(U, A, B, R) \
+  ((__m512)__builtin_selectvector((__v16sf)_mm512_div_round_ps((A), (B), (R)), \
+                                  (__v16sf)_mm512_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask16, (U))))
 
 #define _mm512_roundscale_ps(A, B) \
   ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
@@ -3272,27 +3286,27 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
                                __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
-                                             (__v16si)__A);
+  return (__m512i)__builtin_selectvector(
+      (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__A,
+      __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
                                 __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
-                                             (__v16si)__I);
+  return (__m512i)__builtin_selectvector(
+      (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__I,
+      __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
                                 __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector(
+      (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
+      (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -3306,56 +3320,56 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
                                __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                             (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
-                                             (__v8di)__A);
+  return (__m512i)__builtin_selectvector(
+      (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__A,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
                                 __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                             (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
-                                             (__v8di)__I);
+  return (__m512i)__builtin_selectvector(
+      (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__I,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
                                 __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                             (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector(
+      (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
+      (__v8di)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask8, __U));
 }
 
 #define _mm512_alignr_epi64(A, B, I) \
   ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
                                      (__v8di)(__m512i)(B), (int)(I)))
 
-#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
-                                       (__v8di)(__m512i)(W)))
+#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
+  ((__m512i)__builtin_selectvector( \
+      (__v8di)_mm512_alignr_epi64((A), (B), (imm)), (__v8di)(__m512i)(W), \
+      __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
-                                       (__v8di)_mm512_setzero_si512()))
+#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
+  ((__m512i)__builtin_selectvector( \
+      (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+      (__v8di)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm512_alignr_epi32(A, B, I) \
   ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
                                      (__v16si)(__m512i)(B), (int)(I)))
 
-#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
-                                       (__v16si)(__m512i)(W)))
+#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
+  ((__m512i)__builtin_selectvector( \
+      (__v16si)_mm512_alignr_epi32((A), (B), (imm)), (__v16si)(__m512i)(W), \
+      __builtin_bit_cast(__vecmask16, (U))))
 
-#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
-                                       (__v16si)_mm512_setzero_si512()))
+#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
+  ((__m512i)__builtin_selectvector( \
+      (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+      (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, (U))))
 
 /* Vector Extract */
 
 #define _mm512_extractf64x4_pd(A, I) \
@@ -3393,33 +3407,29 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
 static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
 {
-  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
-                                                (__v8df) __W,
-                                                (__v8df) __A);
+  return (__m512d)__builtin_selectvector((__v8df)__W, (__v8df)__A,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
 {
-  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
-                                               (__v16sf) __W,
-                                               (__v16sf) __A);
+  return (__m512)__builtin_selectvector((__v16sf)__W, (__v16sf)__A,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
 {
-  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
-                                               (__v8di) __W,
-                                               (__v8di) __A);
+  return (__m512i)__builtin_selectvector((__v8di)__W, (__v8di)__A,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
 {
-  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
-                                               (__v16si) __W,
-                                               (__v16si) __A);
+  return (__m512i)__builtin_selectvector((__v16si)__W, (__v16si)__A,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 /* Compare */
 
@@ -3619,17 +3629,17 @@ _mm512_cvtepu32_ps (__m512i __A)
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
 {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_cvtepu32_ps(__A),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_cvtepu32_ps(__A),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
 {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_cvtepu32_ps(__A),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_cvtepu32_ps(__A),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS512
@@ -3641,17 +3651,17 @@ _mm512_cvtepi32_pd(__m256i __A)
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_cvtepi32_pd(__A),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_cvtepi32_pd(__A),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_cvtepi32_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_cvtepi32_pd(__A),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
@@ -3675,17 +3685,17 @@ _mm512_cvtepi32_ps (__m512i __A)
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
 {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_cvtepi32_ps(__A),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_cvtepi32_ps(__A),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
 {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
-                                             (__v16sf)_mm512_cvtepi32_ps(__A),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_cvtepi32_ps(__A),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS512
@@ -3697,17 +3707,17 @@ _mm512_cvtepu32_pd(__m256i __A)
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_cvtepu32_pd(__A),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_cvtepu32_pd(__A),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_cvtepu32_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_cvtepu32_pd(__A),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
@@ -4128,17 +4138,17 @@ _mm512_unpackhi_pd(__m512d __a, __m512d __b)
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_unpackhi_pd(__A, __B),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_unpackhi_pd(__A, __B),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_unpackhi_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_unpackhi_pd(__A, __B),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS512
@@ -4151,17 +4161,17 @@ _mm512_unpacklo_pd(__m512d __a, __m512d __b)
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_unpacklo_pd(__A, __B),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_unpacklo_pd(__A, __B),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
-                                              (__v8df)_mm512_unpacklo_pd(__A, __B),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_unpacklo_pd(__A, __B),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
@@ -4177,17 +4187,17 @@ _mm512_unpackhi_ps(__m512 __a, __m512 __b)
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
-                                             (__v16sf)_mm512_unpackhi_ps(__A, __B),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_unpackhi_ps(__A, __B),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
-                                             (__v16sf)_mm512_unpackhi_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_unpackhi_ps(__A, __B),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
@@ -4203,17 +4213,17 @@ _mm512_unpacklo_ps(__m512 __a, __m512 __b)
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
-                                             (__v16sf)_mm512_unpacklo_ps(__A, __B),
-                                             (__v16sf)__W);
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_unpacklo_ps(__A, __B),
+                                        (__v16sf)__W,
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
-  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
-                                             (__v16sf)_mm512_unpacklo_ps(__A, __B),
-                                             (__v16sf)_mm512_setzero_ps());
+  return (__m512)__builtin_selectvector((__v16sf)_mm512_unpacklo_ps(__A, __B),
+                                        (__v16sf)_mm512_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4229,17 +4239,17 @@ _mm512_unpackhi_epi32(__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
-                                             (__v16si)_mm512_unpackhi_epi32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector(
+      (__v16si)_mm512_unpackhi_epi32(__A, __B), (__v16si)__W,
+      __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
-                                             (__v16si)_mm512_unpackhi_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector(
+      (__v16si)_mm512_unpackhi_epi32(__A, __B), (__v16si)_mm512_setzero_si512(),
+      __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4255,17 +4265,17 @@ _mm512_unpacklo_epi32(__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
-                                             (__v16si)_mm512_unpacklo_epi32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector(
+      (__v16si)_mm512_unpacklo_epi32(__A, __B), (__v16si)__W,
+      __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
-                                             (__v16si)_mm512_unpacklo_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector(
+      (__v16si)_mm512_unpacklo_epi32(__A, __B), (__v16si)_mm512_setzero_si512(),
+      __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4278,17 +4288,17 @@ _mm512_unpackhi_epi64(__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
-                                             (__v8di)_mm512_unpackhi_epi64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector(
+      (__v8di)_mm512_unpackhi_epi64(__A, __B), (__v8di)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
-                                             (__v8di)_mm512_unpackhi_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector(
+      (__v8di)_mm512_unpackhi_epi64(__A, __B), (__v8di)_mm512_setzero_si512(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4301,20 +4311,19 @@ _mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
-                                             (__v8di)_mm512_unpacklo_epi64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector(
+      (__v8di)_mm512_unpacklo_epi64(__A, __B), (__v8di)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
-                                             (__v8di)_mm512_unpacklo_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector(
+      (__v8di)_mm512_unpacklo_epi64(__A, __B), (__v8di)_mm512_setzero_si512(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
-
 /* SIMD load ops */
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -4733,17 +4742,17 @@ _mm512_cvtepi8_epi32(__m128i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepi8_epi32(__A),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_cvtepi8_epi32(__A),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepi8_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_cvtepi8_epi32(__A),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4757,17 +4766,17 @@ _mm512_cvtepi8_epi64(__m128i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi8_epi64(__A),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepi8_epi64(__A),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi8_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512 ());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepi8_epi64(__A),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4779,17 +4788,17 @@ _mm512_cvtepi32_epi64(__m256i __X)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi32_epi64(__X),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepi32_epi64(__X),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi32_epi64(__X),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepi32_epi64(__X),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4801,17 +4810,17 @@ _mm512_cvtepi16_epi32(__m256i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepi16_epi32(__A),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_cvtepi16_epi32(__A),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepi16_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512 ());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_cvtepi16_epi32(__A),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4823,17 +4832,17 @@ _mm512_cvtepi16_epi64(__m128i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi16_epi64(__A),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepi16_epi64(__A),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepi16_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepi16_epi64(__A),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4845,17 +4854,17 @@ _mm512_cvtepu8_epi32(__m128i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepu8_epi32(__A),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_cvtepu8_epi32(__A),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepu8_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_cvtepu8_epi32(__A),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4867,17 +4876,17 @@ _mm512_cvtepu8_epi64(__m128i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu8_epi64(__A),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepu8_epi64(__A),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu8_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepu8_epi64(__A),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4889,17 +4898,17 @@ _mm512_cvtepu32_epi64(__m256i __X)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu32_epi64(__X),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepu32_epi64(__X),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu32_epi64(__X),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepu32_epi64(__X),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4911,17 +4920,17 @@ _mm512_cvtepu16_epi32(__m256i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepu16_epi32(__A),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_cvtepu16_epi32(__A),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepu16_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_cvtepu16_epi32(__A),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4933,17 +4942,17 @@ _mm512_cvtepu16_epi64(__m128i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu16_epi64(__A),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepu16_epi64(__A),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_cvtepu16_epi64(__A),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_cvtepu16_epi64(__A),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4955,17 +4964,17 @@ _mm512_rorv_epi32 (__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_rorv_epi32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_rorv_epi32(__A, __B),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_rorv_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_rorv_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -4977,21 +4986,19 @@ _mm512_rorv_epi64 (__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                             (__v8di)_mm512_rorv_epi64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_rorv_epi64(__A, __B),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                             (__v8di)_mm512_rorv_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_rorv_epi64(__A, __B),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
-
-
 #define _mm512_cmp_epi32_mask(a, b, p) \
   ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
                                           (__v16si)(__m512i)(b), (int)(p), \
@@ -5035,28 +5042,28 @@ _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 #define _mm512_rol_epi32(a, b) \
   ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))
 
-#define _mm512_mask_rol_epi32(W, U, a, b) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_rol_epi32((a), (b)), \
-                                       (__v16si)(__m512i)(W)))
+#define _mm512_mask_rol_epi32(W, U, a, b) \
+  ((__m512i)__builtin_selectvector((__v16si)_mm512_rol_epi32((a), (b)), \
+                                   (__v16si)(__m512i)(W), \
+                                   __builtin_bit_cast(__vecmask16, (U))))
 
-#define _mm512_maskz_rol_epi32(U, a, b) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_rol_epi32((a), (b)), \
-                                       (__v16si)_mm512_setzero_si512()))
+#define _mm512_maskz_rol_epi32(U, a, b) \
+  ((__m512i)__builtin_selectvector((__v16si)_mm512_rol_epi32((a), (b)), \
+                                   (__v16si)_mm512_setzero_si512(), \
+                                   __builtin_bit_cast(__vecmask16, (U))))
 
 #define _mm512_rol_epi64(a, b) \
   ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)))
 
-#define _mm512_mask_rol_epi64(W, U, a, b) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_rol_epi64((a), (b)), \
-                                       (__v8di)(__m512i)(W)))
+#define _mm512_mask_rol_epi64(W, U, a, b) \
+  ((__m512i)__builtin_selectvector((__v8di)_mm512_rol_epi64((a), (b)), \
+                                   (__v8di)(__m512i)(W), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm512_maskz_rol_epi64(U, a, b) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_rol_epi64((a), (b)), \
-                                       (__v8di)_mm512_setzero_si512()))
+#define _mm512_maskz_rol_epi64(U, a, b) \
+  ((__m512i)__builtin_selectvector((__v8di)_mm512_rol_epi64((a), (b)), \
+                                   (__v8di)_mm512_setzero_si512(), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_rolv_epi32 (__m512i __A, __m512i __B)
@@ -5067,17 +5074,17 @@ _mm512_rolv_epi32 (__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_rolv_epi32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_rolv_epi32(__A, __B),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512(__U,
-                                             (__v16si)_mm512_rolv_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_rolv_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5089,44 +5096,44 @@ _mm512_rolv_epi64 (__m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                             (__v8di)_mm512_rolv_epi64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_rolv_epi64(__A, __B),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512(__U,
-                                             (__v8di)_mm512_rolv_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_rolv_epi64(__A, __B),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 #define _mm512_ror_epi32(A, B) \
   ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)))
 
-#define _mm512_mask_ror_epi32(W, U, A, B) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_ror_epi32((A), (B)), \
-                                       (__v16si)(__m512i)(W)))
+#define _mm512_mask_ror_epi32(W, U, A, B) \
+  ((__m512i)__builtin_selectvector((__v16si)_mm512_ror_epi32((A), (B)), \
+                                   (__v16si)(__m512i)(W), \
+                                   __builtin_bit_cast(__vecmask16, (U))))
 
-#define _mm512_maskz_ror_epi32(U, A, B) \
-  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
-                                       (__v16si)_mm512_ror_epi32((A), (B)), \
-                                       (__v16si)_mm512_setzero_si512()))
+#define _mm512_maskz_ror_epi32(U, A, B) \
+  ((__m512i)__builtin_selectvector((__v16si)_mm512_ror_epi32((A), (B)), \
+                                   (__v16si)_mm512_setzero_si512(), \
+                                   __builtin_bit_cast(__vecmask16, (U))))
 
 #define _mm512_ror_epi64(A, B) \
   ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)))
 
-#define _mm512_mask_ror_epi64(W, U, A, B) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_ror_epi64((A), (B)), \
-                                       (__v8di)(__m512i)(W)))
+#define _mm512_mask_ror_epi64(W, U, A, B) \
+  ((__m512i)__builtin_selectvector((__v8di)_mm512_ror_epi64((A), (B)), \
+                                   (__v8di)(__m512i)(W), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm512_maskz_ror_epi64(U, A, B) \
-  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
-                                       (__v8di)_mm512_ror_epi64((A), (B)), \
-                                       (__v8di)_mm512_setzero_si512()))
+#define _mm512_maskz_ror_epi64(U, A, B) \
  ((__m512i)__builtin_selectvector((__v8di)_mm512_ror_epi64((A), (B)), \
+                                   (__v8di)_mm512_setzero_si512(), \
+                                   __builtin_bit_cast(__vecmask8, (U))))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_slli_epi32(__m512i __A, unsigned int __B)
@@ -5138,16 +5145,16 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                        unsigned int __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_slli_epi32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_slli_epi32(__A, __B),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_slli_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_slli_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5159,17 +5166,17 @@ _mm512_slli_epi64(__m512i __A, unsigned int __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_slli_epi64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_slli_epi64(__A, __B),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_slli_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_slli_epi64(__A, __B),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5182,16 +5189,16 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                        unsigned int __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_srli_epi32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_srli_epi32(__A, __B),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_srli_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_srli_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5204,18 +5211,18 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
                        unsigned int __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_srli_epi64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_srli_epi64(__A, __B),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_srli_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_srli_epi64(__A, __B),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5245,33 +5252,31 @@ _mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
-                                               (__v16si) __A,
-                                               (__v16si) __W);
+  return (__m512i)__builtin_selectvector((__v16si)__A, (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
-                                               (__v16si) __A,
-                                               (__v16si) _mm512_setzero_si512 ());
+  return (__m512i)__builtin_selectvector((__v16si)__A,
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
-                                               (__v8di) __A,
-                                               (__v8di) __W);
+  return (__m512i)__builtin_selectvector((__v8di)__A, (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
-                                               (__v8di) __A,
-                                               (__v8di) _mm512_setzero_si512 ());
+  return (__m512i)__builtin_selectvector((__v8di)__A,
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5308,17 +5313,17 @@ _mm512_movedup_pd (__m512d __A)
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_movedup_pd(__A),
-                                              (__v8df)__W);
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_movedup_pd(__A),
+                                         (__v8df)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
 {
-  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
-                                              (__v8df)_mm512_movedup_pd(__A),
-                                              (__v8df)_mm512_setzero_pd());
+  return (__m512d)__builtin_selectvector((__v8df)_mm512_movedup_pd(__A),
+                                         (__v8df)_mm512_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 #define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
@@ -5688,17 +5693,17 @@ _mm512_sll_epi32(__m512i __A, __m128i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_sll_epi32(__A, __B),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_sll_epi32(__A, __B),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_sll_epi32(__A, __B),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_sll_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5710,17 +5715,17 @@ _mm512_sll_epi64(__m512i __A, __m128i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_sll_epi64(__A, __B),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_sll_epi64(__A, __B),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_sll_epi64(__A, __B),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_sll_epi64(__A, __B),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5732,17 +5737,17 @@ _mm512_sllv_epi32(__m512i __X, __m512i __Y)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_sllv_epi32(__X, __Y),
-                                             (__v16si)__W);
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_sllv_epi32(__X, __Y),
+                                         (__v16si)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_sllv_epi32(__X, __Y),
-                                             (__v16si)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v16si)_mm512_sllv_epi32(__X, __Y),
+                                         (__v16si)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5754,17 +5759,17 @@ _mm512_sllv_epi64(__m512i __X, __m512i __Y)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
-                                             (__v8di)__W);
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_sllv_epi64(__X, __Y),
+                                         (__v8di)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
 {
-  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
-                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
-                                             (__v8di)_mm512_setzero_si512());
+  return (__m512i)__builtin_selectvector((__v8di)_mm512_sllv_epi64(__X, __Y),
+                                         (__v8di)_mm512_setzero_si512(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static
__inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5776,17 +5781,17 @@ _mm512_sra_epi32(__m512i __A, __m128i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sra_epi32(__A, __B), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_sra_epi32(__A, __B), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sra_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_sra_epi32(__A, __B), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5798,17 +5803,17 @@ _mm512_sra_epi64(__m512i __A, __m128i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sra_epi64(__A, __B), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_sra_epi64(__A, __B), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sra_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_sra_epi64(__A, __B), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5820,17 +5825,17 @@ _mm512_srav_epi32(__m512i __X, __m512i __Y) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srav_epi32(__X, __Y), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_srav_epi32(__X, __Y), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srav_epi32(__X, __Y), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_srav_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5842,17 +5847,17 @@ _mm512_srav_epi64(__m512i __X, __m512i __Y) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srav_epi64(__X, __Y), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_srav_epi64(__X, __Y), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srav_epi64(__X, __Y), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_srav_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512(), + 
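/*
 * A sketch under stated assumptions: the __vecmaskN types used above are not
 * defined in this section. A plausible definition is a Clang extended
 * boolean vector whose storage size matches the corresponding __mmaskN,
 * which is what makes __builtin_bit_cast well-formed (it requires source and
 * destination of identical size). The _sketch names below are hypothetical.
 */
typedef _Bool __vecmask8_sketch __attribute__((ext_vector_type(8)));
typedef _Bool __vecmask16_sketch __attribute__((ext_vector_type(16)));
/*
 * Under this definition, sizeof(__vecmask8_sketch) == sizeof(__mmask8), so
 * __builtin_bit_cast(__vecmask8, __U) reinterprets the eight mask bits as
 * eight one-bit lanes without changing the bit pattern. Note this presumes
 * __U already has the exact __mmaskN type; a wider integer operand would
 * need a narrowing cast first.
 */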
__builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5864,17 +5869,17 @@ _mm512_srl_epi32(__m512i __A, __m128i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srl_epi32(__A, __B), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_srl_epi32(__A, __B), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srl_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_srl_epi32(__A, __B), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5886,17 +5891,17 @@ _mm512_srl_epi64(__m512i __A, __m128i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srl_epi64(__A, __B), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_srl_epi64(__A, __B), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srl_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_srl_epi64(__A, __B), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5908,17 +5913,17 @@ _mm512_srlv_epi32(__m512i __X, __m512i __Y) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srlv_epi32(__X, __Y), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_srlv_epi32(__X, __Y), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srlv_epi32(__X, __Y), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_srlv_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5930,17 +5935,17 @@ _mm512_srlv_epi64 (__m512i __X, __m512i __Y) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srlv_epi64(__X, __Y), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_srlv_epi64(__X, __Y), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srlv_epi64(__X, __Y), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_srlv_epi64(__X, __Y), 
+ (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } /// \enum _MM_TERNLOG_ENUM @@ -6162,28 +6167,28 @@ _mm_cvttss_u64 (__m128 __A) #define _mm512_permute_pd(X, C) \ ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C))) -#define _mm512_mask_permute_pd(W, U, X, C) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permute_pd((X), (C)), \ - (__v8df)(__m512d)(W))) +#define _mm512_mask_permute_pd(W, U, X, C) \ + ((__m512d)__builtin_selectvector((__v8df)_mm512_permute_pd((X), (C)), \ + (__v8df)(__m512d)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_permute_pd(U, X, C) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permute_pd((X), (C)), \ - (__v8df)_mm512_setzero_pd())) +#define _mm512_maskz_permute_pd(U, X, C) \ + ((__m512d)__builtin_selectvector((__v8df)_mm512_permute_pd((X), (C)), \ + (__v8df)_mm512_setzero_pd(), \ + __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_permute_ps(X, C) \ ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C))) -#define _mm512_mask_permute_ps(W, U, X, C) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)(__m512)(W))) +#define _mm512_mask_permute_ps(W, U, X, C) \ + ((__m512)__builtin_selectvector((__v16sf)_mm512_permute_ps((X), (C)), \ + (__v16sf)(__m512)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_permute_ps(U, X, C) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)_mm512_setzero_ps())) +#define _mm512_maskz_permute_ps(U, X, C) \ + ((__m512)__builtin_selectvector((__v16sf)_mm512_permute_ps((X), (C)), \ + (__v16sf)_mm512_setzero_ps(), \ + __builtin_bit_cast(__vecmask16, (U)))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_permutevar_pd(__m512d __A, __m512i __C) @@ -6194,17 +6199,17 @@ _mm512_permutevar_pd(__m512d __A, __m512i __C) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_permutevar_pd(__A, __C), - (__v8df)__W); + return (__m512d)__builtin_selectvector((__v8df)_mm512_permutevar_pd(__A, __C), + (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_permutevar_pd(__A, __C), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_permutevar_pd(__A, __C), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -6216,17 +6221,17 @@ _mm512_permutevar_ps(__m512 __A, __m512i __C) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_permutevar_ps(__A, __C), - (__v16sf)__W); + return (__m512)__builtin_selectvector((__v16sf)_mm512_permutevar_ps(__A, __C), + (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_permutevar_ps(__A, __C), - (__v16sf)_mm512_setzero_ps()); + return 
(__m512)__builtin_selectvector((__v16sf)_mm512_permutevar_ps(__A, __C), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline __m512d __DEFAULT_FN_ATTRS512 @@ -6239,27 +6244,27 @@ _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_permutex2var_pd(__A, __I, __B), - (__v8df)__A); + return (__m512d)__builtin_selectvector( + (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_permutex2var_pd(__A, __I, __B), - (__v8df)(__m512d)__I); + return (__m512d)__builtin_selectvector( + (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)(__m512d)__I, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_permutex2var_pd(__A, __I, __B), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector( + (__v8df)_mm512_permutex2var_pd(__A, __I, __B), + (__v8df)_mm512_setzero_pd(), __builtin_bit_cast(__vecmask8, __U)); } static __inline __m512 __DEFAULT_FN_ATTRS512 @@ -6272,28 +6277,27 @@ _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), - (__v16sf)__A); + return (__m512)__builtin_selectvector( + (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), - (__v16sf)(__m512)__I); + return (__m512)__builtin_selectvector( + (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)(__m512)__I, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector( + (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), + (__v16sf)_mm512_setzero_ps(), __builtin_bit_cast(__vecmask16, __U)); } - #define _mm512_cvtt_roundpd_epu32(A, R) \ ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_undefined_si256(), \ @@ -6627,17 +6631,16 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srai_epi32(__A, __B), - (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_srai_epi32(__A, __B), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, - unsigned int __B) { 
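/*
 * A sketch, not part of the patch: the three permutex2var flavours above
 * differ only in the fall-back operand handed to __builtin_selectvector.
 * _mm512_mask_* merges into the first source (__A), _mm512_mask2_* merges
 * into the index vector reinterpreted as the element type (the
 * (__v8df)(__m512d)__I cast), and _mm512_maskz_* zeroes. The hypothetical
 * helper below mirrors that per-lane index reinterpretation in portable C.
 */
#include <string.h>

static inline double idx_as_double(long long idx) {
  double d; /* same bit pattern, new type, as (__v8df)(__m512d)__I does per lane */
  memcpy(&d, &idx, sizeof d);
  return d;
}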
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srai_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); +_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { + return (__m512i)__builtin_selectvector((__v16si)_mm512_srai_epi32(__A, __B), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -6649,102 +6652,102 @@ _mm512_srai_epi64(__m512i __A, unsigned int __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srai_epi64(__A, __B), - (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_srai_epi64(__A, __B), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srai_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_srai_epi64(__A, __B), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } #define _mm512_shuffle_f32x4(A, B, imm) \ ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(imm))) -#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)(__m512)(W))) +#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \ + ((__m512)__builtin_selectvector( \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), (__v16sf)(__m512)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps())) +#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \ + ((__m512)__builtin_selectvector( \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps(), __builtin_bit_cast(__vecmask16, (U)))) #define _mm512_shuffle_f64x2(A, B, imm) \ ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(imm))) -#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - (__v8df)(__m512d)(W))) +#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \ + ((__m512d)__builtin_selectvector( \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), (__v8df)(__m512d)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd())) +#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \ + ((__m512d)__builtin_selectvector( \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd(), __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_shuffle_i32x4(A, B, imm) \ ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ (__v16si)(__m512i)(B), (int)(imm))) -#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)(__m512i)(W))) +#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \ + 
((__m512i)__builtin_selectvector( \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), (__v16si)(__m512i)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512())) +#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, (U)))) #define _mm512_shuffle_i64x2(A, B, imm) \ ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ (__v8di)(__m512i)(B), (int)(imm))) -#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)(__m512i)(W))) +#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), (__v8di)(__m512i)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512())) +#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_shuffle_pd(A, B, M) \ ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(M))) -#define _mm512_mask_shuffle_pd(W, U, A, B, M) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)(__m512d)(W))) +#define _mm512_mask_shuffle_pd(W, U, A, B, M) \ + ((__m512d)__builtin_selectvector((__v8df)_mm512_shuffle_pd((A), (B), (M)), \ + (__v8df)(__m512d)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_shuffle_pd(U, A, B, M) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)_mm512_setzero_pd())) +#define _mm512_maskz_shuffle_pd(U, A, B, M) \ + ((__m512d)__builtin_selectvector((__v8df)_mm512_shuffle_pd((A), (B), (M)), \ + (__v8df)_mm512_setzero_pd(), \ + __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_shuffle_ps(A, B, M) \ ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(M))) -#define _mm512_mask_shuffle_ps(W, U, A, B, M) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)(__m512)(W))) +#define _mm512_mask_shuffle_ps(W, U, A, B, M) \ + ((__m512)__builtin_selectvector((__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ + (__v16sf)(__m512)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_shuffle_ps(U, A, B, M) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)_mm512_setzero_ps())) +#define _mm512_maskz_shuffle_ps(U, A, B, M) \ + ((__m512)__builtin_selectvector((__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ + (__v16sf)_mm512_setzero_ps(), \ + __builtin_bit_cast(__vecmask16, (U)))) #define _mm_sqrt_round_sd(A, B, R) \ ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ @@ -6833,17 +6836,17 @@ _mm512_broadcast_f32x4(__m128 __A) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) { - return 
(__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x4(__A), - (__v16sf)__O); + return (__m512)__builtin_selectvector((__v16sf)_mm512_broadcast_f32x4(__A), + (__v16sf)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x4(__A), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_broadcast_f32x4(__A), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -6856,17 +6859,17 @@ _mm512_broadcast_f64x4(__m256d __A) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, - (__v8df)_mm512_broadcast_f64x4(__A), - (__v8df)__O); + return (__m512d)__builtin_selectvector((__v8df)_mm512_broadcast_f64x4(__A), + (__v8df)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, - (__v8df)_mm512_broadcast_f64x4(__A), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_broadcast_f64x4(__A), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -6880,17 +6883,17 @@ _mm512_broadcast_i32x4(__m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x4(__A), - (__v16si)__O); + return (__m512i)__builtin_selectvector((__v16si)_mm512_broadcast_i32x4(__A), + (__v16si)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x4(__A), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v16si)_mm512_broadcast_i32x4(__A), + (__v16si)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -6903,49 +6906,49 @@ _mm512_broadcast_i64x4(__m256i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_broadcast_i64x4(__A), - (__v8di)__O); + return (__m512i)__builtin_selectvector((__v8di)_mm512_broadcast_i64x4(__A), + (__v8di)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_broadcast_i64x4(__A), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector((__v8di)_mm512_broadcast_i64x4(__A), + (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) { - return (__m512d)__builtin_ia32_selectpd_512(__M, - (__v8df) _mm512_broadcastsd_pd(__A), - (__v8df) __O); + return 
(__m512d)__builtin_selectvector((__v8df)_mm512_broadcastsd_pd(__A), + (__v8df)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) { - return (__m512d)__builtin_ia32_selectpd_512(__M, - (__v8df) _mm512_broadcastsd_pd(__A), - (__v8df) _mm512_setzero_pd()); + return (__m512d)__builtin_selectvector((__v8df)_mm512_broadcastsd_pd(__A), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) { - return (__m512)__builtin_ia32_selectps_512(__M, - (__v16sf) _mm512_broadcastss_ps(__A), - (__v16sf) __O); + return (__m512)__builtin_selectvector((__v16sf)_mm512_broadcastss_ps(__A), + (__v16sf)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) { - return (__m512)__builtin_ia32_selectps_512(__M, - (__v16sf) _mm512_broadcastss_ps(__A), - (__v16sf) _mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_broadcastss_ps(__A), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS512 @@ -7420,57 +7423,57 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ (__v4df)(__m256d)(B), (int)(imm))) -#define _mm512_mask_insertf64x4(W, U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)(__m512d)(W))) +#define _mm512_mask_insertf64x4(W, U, A, B, imm) \ + ((__m512d)__builtin_selectvector( \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), (__v8df)(__m512d)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_insertf64x4(U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd())) +#define _mm512_maskz_insertf64x4(U, A, B, imm) \ + ((__m512d)__builtin_selectvector( \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd(), __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_inserti64x4(A, B, imm) \ ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ (__v4di)(__m256i)(B), (int)(imm))) -#define _mm512_mask_inserti64x4(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)(__m512i)(W))) +#define _mm512_mask_inserti64x4(W, U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), (__v8di)(__m512i)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_inserti64x4(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512())) +#define _mm512_maskz_inserti64x4(U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_insertf32x4(A, B, imm) \ ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ (__v4sf)(__m128)(B), (int)(imm))) -#define _mm512_mask_insertf32x4(W, U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)(__m512)(W))) +#define _mm512_mask_insertf32x4(W, U, A, B, imm) \ + 
((__m512)__builtin_selectvector( \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), (__v16sf)(__m512)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_insertf32x4(U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps())) +#define _mm512_maskz_insertf32x4(U, A, B, imm) \ + ((__m512)__builtin_selectvector( \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps(), __builtin_bit_cast(__vecmask16, (U)))) #define _mm512_inserti32x4(A, B, imm) \ ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ (__v4si)(__m128i)(B), (int)(imm))) -#define _mm512_mask_inserti32x4(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)(__m512i)(W))) +#define _mm512_mask_inserti32x4(W, U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), (__v16si)(__m512i)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_inserti32x4(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512())) +#define _mm512_maskz_inserti32x4(U, A, B, imm) \ + ((__m512i)__builtin_selectvector( \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, (U)))) #define _mm512_getmant_round_pd(A, B, C, R) \ ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ @@ -8246,28 +8249,28 @@ _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) #define _mm512_permutex_pd(X, C) \ ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C))) -#define _mm512_mask_permutex_pd(W, U, X, C) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)(__m512d)(W))) +#define _mm512_mask_permutex_pd(W, U, X, C) \ + ((__m512d)__builtin_selectvector((__v8df)_mm512_permutex_pd((X), (C)), \ + (__v8df)(__m512d)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_permutex_pd(U, X, C) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)_mm512_setzero_pd())) +#define _mm512_maskz_permutex_pd(U, X, C) \ + ((__m512d)__builtin_selectvector((__v8df)_mm512_permutex_pd((X), (C)), \ + (__v8df)_mm512_setzero_pd(), \ + __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_permutex_epi64(X, C) \ ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C))) -#define _mm512_mask_permutex_epi64(W, U, X, C) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)(__m512i)(W))) +#define _mm512_mask_permutex_epi64(W, U, X, C) \ + ((__m512i)__builtin_selectvector((__v8di)_mm512_permutex_epi64((X), (C)), \ + (__v8di)(__m512i)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_permutex_epi64(U, X, C) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)_mm512_setzero_si512())) +#define _mm512_maskz_permutex_epi64(U, X, C) \ + ((__m512i)__builtin_selectvector((__v8di)_mm512_permutex_epi64((X), (C)), \ + (__v8di)_mm512_setzero_si512(), \ + __builtin_bit_cast(__vecmask8, (U)))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_permutexvar_pd (__m512i __X, __m512d __Y) @@ -8278,17 +8281,17 @@ _mm512_permutexvar_pd (__m512i __X, __m512d __Y) 
static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_permutexvar_pd(__X, __Y), - (__v8df)__W); + return (__m512d)__builtin_selectvector( + (__v8df)_mm512_permutexvar_pd(__X, __Y), (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_permutexvar_pd(__X, __Y), - (__v8df)_mm512_setzero_pd()); + return (__m512d)__builtin_selectvector( + (__v8df)_mm512_permutexvar_pd(__X, __Y), (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -8300,18 +8303,18 @@ _mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_permutexvar_epi64(__X, __Y), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_permutexvar_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_permutexvar_epi64(__X, __Y), - (__v8di)__W); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_permutexvar_epi64(__X, __Y), (__v8di)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -8323,17 +8326,17 @@ _mm512_permutexvar_ps (__m512i __X, __m512 __Y) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_permutexvar_ps(__X, __Y), - (__v16sf)__W); + return (__m512)__builtin_selectvector( + (__v16sf)_mm512_permutexvar_ps(__X, __Y), (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_permutexvar_ps(__X, __Y), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector( + (__v16sf)_mm512_permutexvar_ps(__X, __Y), (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -8347,18 +8350,18 @@ _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_permutexvar_epi32(__X, __Y), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_permutexvar_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_permutexvar_epi32(__X, __Y), - (__v16si)__W); + return (__m512i)__builtin_selectvector( + 
(__v16si)_mm512_permutexvar_epi32(__X, __Y), (__v16si)__W, + __builtin_bit_cast(__vecmask16, __M)); } #define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32 @@ -8670,17 +8673,17 @@ _mm512_movehdup_ps (__m512 __A) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_movehdup_ps(__A), - (__v16sf)__W); + return (__m512)__builtin_selectvector((__v16sf)_mm512_movehdup_ps(__A), + (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_movehdup_ps(__A), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_movehdup_ps(__A), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -8693,17 +8696,17 @@ _mm512_moveldup_ps (__m512 __A) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_moveldup_ps(__A), - (__v16sf)__W); + return (__m512)__builtin_selectvector((__v16sf)_mm512_moveldup_ps(__A), + (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_moveldup_ps(__A), - (__v16sf)_mm512_setzero_ps()); + return (__m512)__builtin_selectvector((__v16sf)_mm512_moveldup_ps(__A), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -8783,15 +8786,15 @@ _mm_maskz_load_sd (__mmask8 __U, const double* __A) #define _mm512_shuffle_epi32(A, I) \ ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I))) -#define _mm512_mask_shuffle_epi32(W, U, A, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)(__m512i)(W))) +#define _mm512_mask_shuffle_epi32(W, U, A, I) \ + ((__m512i)__builtin_selectvector((__v16si)_mm512_shuffle_epi32((A), (I)), \ + (__v16si)(__m512i)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_shuffle_epi32(U, A, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)_mm512_setzero_si512())) +#define _mm512_maskz_shuffle_epi32(U, A, I) \ + ((__m512i)__builtin_selectvector((__v16si)_mm512_shuffle_epi32((A), (I)), \ + (__v16si)_mm512_setzero_si512(), \ + __builtin_bit_cast(__vecmask16, (U)))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) @@ -8945,17 +8948,17 @@ _mm512_cvtps_pd (__m256 __A) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtps_pd(__A), - (__v8df)__W); + return (__m512d)__builtin_selectvector((__v8df)_mm512_cvtps_pd(__A), + (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtps_pd(__A), - (__v8df)_mm512_setzero_pd()); + return 
(__m512d)__builtin_selectvector((__v8df)_mm512_cvtps_pd(__A), + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -8973,33 +8976,31 @@ _mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) { - return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, - (__v8df) __A, - (__v8df) __W); + return (__m512d)__builtin_selectvector((__v8df)__A, (__v8df)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) { - return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, - (__v8df) __A, - (__v8df) _mm512_setzero_pd ()); + return (__m512d)__builtin_selectvector((__v8df)__A, + (__v8df)_mm512_setzero_pd(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, - (__v16sf) __A, - (__v16sf) __W); + return (__m512)__builtin_selectvector((__v16sf)__A, (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) { - return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, - (__v16sf) __A, - (__v16sf) _mm512_setzero_ps ()); + return (__m512)__builtin_selectvector((__v16sf)__A, + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ void __DEFAULT_FN_ATTRS512 @@ -9186,17 +9187,17 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) { - return (__m512i) __builtin_ia32_selectd_512(__M, - (__v16si) _mm512_set1_epi32(__A), - (__v16si) __O); + return (__m512i)__builtin_selectvector((__v16si)_mm512_set1_epi32(__A), + (__v16si)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { - return (__m512i) __builtin_ia32_selectq_512(__M, - (__v8di) _mm512_set1_epi64(__A), - (__v8di) __O); + return (__m512i)__builtin_selectvector((__v8di)_mm512_set1_epi64(__A), + (__v8di)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline __m512i __DEFAULT_FN_ATTRS512 diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 4123f10c39513..13cab2c4bc399 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -361,15 +361,16 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A, static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_add_ph(__A, __B), + (__v32hf)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_add_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_add_ph(__A, __B), + (__v32hf)_mm512_setzero_ph(), + __builtin_bit_cast(__vecmask32, __U)); } #define 
_mm512_add_round_ph(A, B, R) \ @@ -377,14 +378,14 @@ _mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { (__v32hf)(__m512h)(B), (int)(R))) #define _mm512_mask_add_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_add_round_ph((A), (B), (R)), (__v32hf)(__m512h)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) #define _mm512_maskz_add_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph(), __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A, __m512h __B) { @@ -393,15 +394,16 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A, static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_sub_ph(__A, __B), + (__v32hf)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_sub_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_sub_ph(__A, __B), + (__v32hf)_mm512_setzero_ph(), + __builtin_bit_cast(__vecmask32, __U)); } #define _mm512_sub_round_ph(A, B, R) \ @@ -409,14 +411,14 @@ _mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { (__v32hf)(__m512h)(B), (int)(R))) #define _mm512_mask_sub_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_sub_round_ph((A), (B), (R)), (__v32hf)(__m512h)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) #define _mm512_maskz_sub_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph(), __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A, __m512h __B) { @@ -425,15 +427,16 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A, static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_mul_ph(__A, __B), + (__v32hf)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_mul_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_mul_ph(__A, __B), + (__v32hf)_mm512_setzero_ph(), + 
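/*
 * A sketch, assuming Clang's extended boolean vectors: the FP16 forms follow
 * the same pattern with 32 half-precision lanes, so the selector is built
 * from a 32-bit mask. Under the assumed definition of __vecmask32 (the
 * _sketch name is hypothetical), the sizes line up as the bit_cast requires:
 */
typedef _Bool __vecmask32_sketch __attribute__((ext_vector_type(32)));
_Static_assert(sizeof(__vecmask32_sketch) == sizeof(unsigned int),
               "32 one-bit lanes occupy the same 4 bytes as an __mmask32");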
__builtin_bit_cast(__vecmask32, __U)); } #define _mm512_mul_round_ph(A, B, R) \ @@ -441,14 +444,14 @@ _mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { (__v32hf)(__m512h)(B), (int)(R))) #define _mm512_mask_mul_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_mul_round_ph((A), (B), (R)), (__v32hf)(__m512h)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) #define _mm512_maskz_mul_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph(), __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A, __m512h __B) { @@ -457,15 +460,16 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A, static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_div_ph(__A, __B), + (__v32hf)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_div_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_div_ph(__A, __B), + (__v32hf)_mm512_setzero_ph(), + __builtin_bit_cast(__vecmask32, __U)); } #define _mm512_div_round_ph(A, B, R) \ @@ -473,14 +477,14 @@ _mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { (__v32hf)(__m512h)(B), (int)(R))) #define _mm512_mask_div_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_div_round_ph((A), (B), (R)), (__v32hf)(__m512h)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) #define _mm512_maskz_div_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph(), __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A, __m512h __B) { @@ -490,15 +494,16 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A, static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_min_ph(__A, __B), + (__v32hf)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_min_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_min_ph(__A, __B), + 
(__v32hf)_mm512_setzero_ph(), + __builtin_bit_cast(__vecmask32, __U)); } #define _mm512_min_round_ph(A, B, R) \ @@ -506,14 +511,14 @@ _mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { (__v32hf)(__m512h)(B), (int)(R))) #define _mm512_mask_min_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_min_round_ph((A), (B), (R)), (__v32hf)(__m512h)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) #define _mm512_maskz_min_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph(), __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A, __m512h __B) { @@ -523,15 +528,16 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A, static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_max_ph(__A, __B), + (__v32hf)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_max_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); + return (__m512h)__builtin_selectvector((__v32hf)_mm512_max_ph(__A, __B), + (__v32hf)_mm512_setzero_ph(), + __builtin_bit_cast(__vecmask32, __U)); } #define _mm512_max_round_ph(A, B, R) \ @@ -539,14 +545,14 @@ _mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { (__v32hf)(__m512h)(B), (int)(R))) #define _mm512_mask_max_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_max_round_ph((A), (B), (R)), (__v32hf)(__m512h)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) #define _mm512_maskz_max_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) + ((__m512h)__builtin_selectvector( \ + (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph(), __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); @@ -558,15 +564,16 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) { static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) { - return (__m512h)__builtin_ia32_selectps_512( - (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W); + return (__m512h)__builtin_selectvector((__v16sf)_mm512_conj_pch(__A), + (__v16sf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) { - return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_conj_pch(__A), - (__v16sf)_mm512_setzero_ps()); + return 
(__m512h)__builtin_selectvector((__v16sf)_mm512_conj_pch(__A), + (__v16sf)_mm512_setzero_ps(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, @@ -1372,14 +1379,14 @@ _mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) { ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R))) #define _mm512_mask_sqrt_round_ph(W, U, A, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ - (__v32hf)(__m512h)(W))) + ((__m512h)__builtin_selectvector((__v32hf)_mm512_sqrt_round_ph((A), (R)), \ + (__v32hf)(__m512h)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) #define _mm512_maskz_sqrt_round_ph(U, A, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ - (__v32hf)_mm512_setzero_ph())) + ((__m512h)__builtin_selectvector((__v32hf)_mm512_sqrt_round_ph((A), (R)), \ + (__v32hf)_mm512_setzero_ph(), \ + __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) { return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A, @@ -1388,18 +1395,16 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) { static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)(__U), + return (__m512h)__builtin_selectvector( (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), - (__v32hf)(__m512h)(__W)); + (__v32hf)(__m512h)(__W), __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)(__U), + return (__m512h)__builtin_selectvector( (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), - (__v32hf)_mm512_setzero_ph()); + (__v32hf)_mm512_setzero_ph(), __builtin_bit_cast(__vecmask32, __U)); } #define _mm_sqrt_round_sh(A, B, R) \ @@ -3293,8 +3298,8 @@ _mm512_reduce_min_ph(__m512h __V) { static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, - (__v32hf)__A); + return (__m512h)__builtin_selectvector((__v32hf)__W, (__v32hf)__A, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h index 9468d17556e72..55cf1bda9606e 100644 --- a/clang/lib/Headers/avx512ifmaintrin.h +++ b/clang/lib/Headers/avx512ifmaintrin.h @@ -29,17 +29,17 @@ _mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), - (__v8di)__W); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), (__v8di)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) { - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z), + 
(__v8di)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -52,17 +52,17 @@ _mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), - (__v8di)__W); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), (__v8di)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) { - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z), + (__v8di)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask8, __M)); } #undef __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h index 8787cd471d423..d2083c2c9db58 100644 --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -43,68 +43,67 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), - (__v2di)__W); + return (__m128i)__builtin_selectvector( + (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), (__v2di)__W, + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z), (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), - (__v4di)__W); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), (__v4di)__W, + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z), + (__v4di)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), - (__v2di)__W); + return (__m128i)__builtin_selectvector( + (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), (__v2di)__W, + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), - (__v2di)_mm_setzero_si128()); + return 
(__m128i)__builtin_selectvector( + (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), - (__v4di)__W); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), (__v4di)__W, + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), + (__v4di)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask4, __M)); } - #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vbmi2intrin.h b/clang/lib/Headers/avx512vbmi2intrin.h index 11598c888787c..bb72ffaa2c217 100644 --- a/clang/lib/Headers/avx512vbmi2intrin.h +++ b/clang/lib/Headers/avx512vbmi2intrin.h @@ -132,85 +132,85 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) ((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \ (__v8di)(__m512i)(B), (int)(I))) -#define _mm512_mask_shldi_epi64(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ - (__v8di)(__m512i)(S))) +#define _mm512_mask_shldi_epi64(S, U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_shldi_epi64(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ - (__v8di)_mm512_setzero_si512())) +#define _mm512_maskz_shldi_epi64(U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512(), \ + __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_shldi_epi32(A, B, I) \ ((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \ (__v16si)(__m512i)(B), (int)(I))) -#define _mm512_mask_shldi_epi32(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ - (__v16si)(__m512i)(S))) +#define _mm512_mask_shldi_epi32(S, U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_shldi_epi32(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ - (__v16si)_mm512_setzero_si512())) +#define _mm512_maskz_shldi_epi32(U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512(), \ + __builtin_bit_cast(__vecmask16, (U)))) #define _mm512_shldi_epi16(A, B, I) \ ((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \ (__v32hi)(__m512i)(B), (int)(I))) -#define _mm512_mask_shldi_epi16(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ - (__v32hi)(__m512i)(S))) +#define _mm512_mask_shldi_epi16(S, U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + 
(__v32hi)(__m512i)(S), \ + __builtin_bit_cast(__vecmask32, (U)))) -#define _mm512_maskz_shldi_epi16(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ - (__v32hi)_mm512_setzero_si512())) +#define _mm512_maskz_shldi_epi16(U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512(), \ + __builtin_bit_cast(__vecmask32, (U)))) #define _mm512_shrdi_epi64(A, B, I) \ ((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \ (__v8di)(__m512i)(B), (int)(I))) -#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ - (__v8di)(__m512i)(S))) +#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm512_maskz_shrdi_epi64(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ - (__v8di)_mm512_setzero_si512())) +#define _mm512_maskz_shrdi_epi64(U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512(), \ + __builtin_bit_cast(__vecmask8, (U)))) #define _mm512_shrdi_epi32(A, B, I) \ ((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \ (__v16si)(__m512i)(B), (int)(I))) -#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ - (__v16si)(__m512i)(S))) +#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm512_maskz_shrdi_epi32(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ - (__v16si)_mm512_setzero_si512())) +#define _mm512_maskz_shrdi_epi32(U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512(), \ + __builtin_bit_cast(__vecmask16, (U)))) #define _mm512_shrdi_epi16(A, B, I) \ ((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \ (__v32hi)(__m512i)(B), (int)(I))) -#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ - (__v32hi)(__m512i)(S))) +#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)(__m512i)(S), \ + __builtin_bit_cast(__vecmask32, (U)))) -#define _mm512_maskz_shrdi_epi16(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ - (__v32hi)_mm512_setzero_si512())) +#define _mm512_maskz_shrdi_epi16(U, A, B, I) \ + ((__m512i)__builtin_selectvector((__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512(), \ + __builtin_bit_cast(__vecmask32, (U)))) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) @@ -222,17 +222,17 @@ _mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectq_512(__U, - 
(__v8di)_mm512_shldv_epi64(__A, __B, __C), - (__v8di)__A); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_shldv_epi64(__A, __B, __C), (__v8di)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_shldv_epi64(__A, __B, __C), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_shldv_epi64(__A, __B, __C), (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -245,17 +245,17 @@ _mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_shldv_epi32(__A, __B, __C), - (__v16si)__A); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_shldv_epi32(__A, __B, __C), (__v16si)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_shldv_epi32(__A, __B, __C), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_shldv_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -268,17 +268,17 @@ _mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_shldv_epi16(__A, __B, __C), - (__v32hi)__A); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_shldv_epi16(__A, __B, __C), (__v32hi)__A, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_shldv_epi16(__A, __B, __C), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_shldv_epi16(__A, __B, __C), + (__v32hi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -291,17 +291,17 @@ _mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_shrdv_epi64(__A, __B, __C), - (__v8di)__A); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_shrdv_epi64(__A, __B, __C), (__v8di)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_shrdv_epi64(__A, __B, __C), - (__v8di)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v8di)_mm512_shrdv_epi64(__A, __B, __C), (__v8di)_mm512_setzero_si512(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -314,17 +314,17 @@ _mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i 
__DEFAULT_FN_ATTRS _mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_selectd_512(__U, - (__v16si)_mm512_shrdv_epi32(__A, __B, __C), - (__v16si)__A); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_shrdv_epi32(__A, __B, __C), (__v16si)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i) __builtin_ia32_selectd_512(__U, - (__v16si)_mm512_shrdv_epi32(__A, __B, __C), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_shrdv_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -337,21 +337,19 @@ _mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), - (__v32hi)__A); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), (__v32hi)__A, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), - (__v32hi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), + (__v32hi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask32, __U)); } - #undef __DEFAULT_FN_ATTRS #endif - diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h index e47cd5caddaad..8cbc1a27a4f19 100644 --- a/clang/lib/Headers/avx512vbmiintrin.h +++ b/clang/lib/Headers/avx512vbmiintrin.h @@ -30,27 +30,27 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512(__U, - (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), - (__v64qi)__A); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__A, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512(__U, - (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), - (__v64qi)__I); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__I, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512(__U, - (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), + (__v64qi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -63,18 +63,18 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_permutexvar_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); 
+ return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_permutexvar_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_permutexvar_epi8(__A, __B), - (__v64qi)__W); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -87,20 +87,19 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), - (__v64qi)__W); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), (__v64qi)__W, + __builtin_bit_cast(__vecmask64, __M)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), - (__v64qi)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), + (__v64qi)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask64, __M)); } - #undef __DEFAULT_FN_ATTRS #endif diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h index 848ca2d18c3ce..ace2bbfb9bf54 100644 --- a/clang/lib/Headers/avx512vbmivlintrin.h +++ b/clang/lib/Headers/avx512vbmivlintrin.h @@ -36,27 +36,27 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128(__U, - (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), - (__v16qi)__A); + return (__m128i)__builtin_selectvector( + (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128(__U, - (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), - (__v16qi)__I); + return (__m128i)__builtin_selectvector( + (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__I, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128(__U, - (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), + (__v16qi)_mm_setzero_si128(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -70,27 +70,27 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256(__U, - (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), - (__v32qi)__A); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__A, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i 
__I, __mmask32 __U, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256(__U, - (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), - (__v32qi)__I); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__I, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256(__U, - (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), + (__v32qi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -102,18 +102,18 @@ _mm_permutexvar_epi8 (__m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_permutexvar_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_permutexvar_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector( + (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -126,18 +126,18 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_permutexvar_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_permutexvar_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_permutexvar_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -150,17 +150,17 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), - (__v16qi)__W); + return (__m128i)__builtin_selectvector( + (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), + (__v16qi)_mm_setzero_si128(), __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -173,20 +173,19 @@ 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), - (__v32qi)__W); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), + (__v32qi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask32, __M)); } - #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h index 89c9f49c7aed0..7143875f943dd 100644 --- a/clang/lib/Headers/avx512vlbf16intrin.h +++ b/clang/lib/Headers/avx512vlbf16intrin.h @@ -61,9 +61,9 @@ _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) { /// conversion of __B, and higher 64 bits come from conversion of __A. static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, - (__v8bf)_mm_cvtne2ps_pbh(__A, __B), - (__v8bf)__W); + return (__m128bh)__builtin_selectvector((__v8bf)_mm_cvtne2ps_pbh(__A, __B), + (__v8bf)__W, + __builtin_bit_cast(__vecmask8, __U)); } /// Convert Two Packed Single Data to One Packed BF16 Data. @@ -83,9 +83,9 @@ _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) { /// conversion of __B, and higher 64 bits come from conversion of __A. static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, - (__v8bf)_mm_cvtne2ps_pbh(__A, __B), - (__v8bf)_mm_setzero_si128()); + return (__m128bh)__builtin_selectvector((__v8bf)_mm_cvtne2ps_pbh(__A, __B), + (__v8bf)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } /// Convert Two Packed Single Data to One Packed BF16 Data. @@ -125,9 +125,9 @@ _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) { /// conversion of __B, and higher 128 bits come from conversion of __A. static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) { - return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, - (__v16bf)_mm256_cvtne2ps_pbh(__A, __B), - (__v16bf)__W); + return (__m256bh)__builtin_selectvector( + (__v16bf)_mm256_cvtne2ps_pbh(__A, __B), (__v16bf)__W, + __builtin_bit_cast(__vecmask16, __U)); } /// Convert Two Packed Single Data to One Packed BF16 Data. @@ -147,9 +147,9 @@ _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) { /// conversion of __B, and higher 128 bits come from conversion of __A. 
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) { - return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, - (__v16bf)_mm256_cvtne2ps_pbh(__A, __B), - (__v16bf)_mm256_setzero_si256()); + return (__m256bh)__builtin_selectvector( + (__v16bf)_mm256_cvtne2ps_pbh(__A, __B), (__v16bf)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } /// Convert Packed Single Data to Packed BF16 Data. @@ -299,9 +299,9 @@ _mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) { /// __A, __B and __D static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_dpbf16_ps(__D, __A, __B), - (__v4sf)__D); + return (__m128)__builtin_selectvector((__v4sf)_mm_dpbf16_ps(__D, __A, __B), + (__v4sf)__D, + __builtin_bit_cast(__vecmask4, __U)); } /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. @@ -323,9 +323,9 @@ _mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) { /// __A, __B and __D static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_dpbf16_ps(__D, __A, __B), - (__v4sf)_mm_setzero_si128()); + return (__m128)__builtin_selectvector((__v4sf)_mm_dpbf16_ps(__D, __A, __B), + (__v4sf)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. @@ -368,9 +368,9 @@ _mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) { /// __A, __B and __D static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), - (__v8sf)__D); + return (__m256)__builtin_selectvector((__v8sf)_mm256_dpbf16_ps(__D, __A, __B), + (__v8sf)__D, + __builtin_bit_cast(__vecmask8, __U)); } /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. @@ -392,9 +392,9 @@ _mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) { /// __A, __B and __D static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), - (__v8sf)_mm256_setzero_si256()); + return (__m256)__builtin_selectvector((__v8sf)_mm256_dpbf16_ps(__D, __A, __B), + (__v8sf)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } /// Convert One Single float Data to One BF16 Data. 
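Every header change in this patch follows the same mechanical pattern, so a minimal sketch of the rewrite may help when reviewing the remaining hunks. It assumes the ``__vecmaskN`` typedefs introduced elsewhere in this patch, i.e. vectors of N booleans along the lines of ``typedef __attribute__((ext_vector_type(8))) _Bool __vecmask8;`` (the exact definition is not shown in these hunks). The helper ``demo_mask_add_epi64`` below is hypothetical and illustrative only, not part of the patch:

.. code-block:: c

   // Illustrative sketch of the masked-intrinsic rewrite pattern, assuming
   // the __vecmask8 typedef added elsewhere in this patch. A real header
   // definition would also carry the usual __DEFAULT_FN_ATTRS target
   // attributes, omitted here for brevity.
   static __inline__ __m512i
   demo_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
     // As these headers use __builtin_selectvector, element i of the result
     // comes from the first operand where bit i of __U is set, and from the
     // second operand (__W) otherwise. Note the operand order: the old
     // __builtin_ia32_selectq_512 took the integer mask first, while the new
     // builtin takes the condition last, as a boolean vector.
     return (__m512i)__builtin_selectvector(
         (__v8di)_mm512_add_epi64(__A, __B), (__v8di)__W,
         __builtin_bit_cast(__vecmask8, __U));
   }

Because ``__mmask8`` is just an 8-bit integer, ``__builtin_bit_cast`` to an 8-element boolean vector reinterprets each mask bit as one lane's condition; that reinterpretation is what lets a single generic builtin stand in for all of the per-lane-type ``__builtin_ia32_select*`` calls being replaced throughout these headers.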
diff --git a/clang/lib/Headers/avx512vlbitalgintrin.h b/clang/lib/Headers/avx512vlbitalgintrin.h index 377e3a5ea5713..546d8d89008ab 100644 --- a/clang/lib/Headers/avx512vlbitalgintrin.h +++ b/clang/lib/Headers/avx512vlbitalgintrin.h @@ -33,9 +33,9 @@ _mm256_popcnt_epi16(__m256i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) { - return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U, - (__v16hi) _mm256_popcnt_epi16(__B), - (__v16hi) __A); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_popcnt_epi16(__B), + (__v16hi)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -55,9 +55,9 @@ _mm_popcnt_epi16(__m128i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) { - return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U, - (__v8hi) _mm_popcnt_epi16(__B), - (__v8hi) __A); + return (__m128i)__builtin_selectvector((__v8hi)_mm_popcnt_epi16(__B), + (__v8hi)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -77,9 +77,9 @@ _mm256_popcnt_epi8(__m256i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) { - return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U, - (__v32qi) _mm256_popcnt_epi8(__B), - (__v32qi) __A); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_popcnt_epi8(__B), + (__v32qi)__A, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -99,9 +99,9 @@ _mm_popcnt_epi8(__m128i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) { - return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U, - (__v16qi) _mm_popcnt_epi8(__B), - (__v16qi) __A); + return (__m128i)__builtin_selectvector((__v16qi)_mm_popcnt_epi8(__B), + (__v16qi)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 9aedba0669991..e75147b838d46 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -308,973 +308,969 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_add_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_add_epi8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_add_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_add_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_add_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_add_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_add_epi16(__mmask16 
__U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_add_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_add_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_sub_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_sub_epi8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_sub_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_sub_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sub_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_sub_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sub_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_sub_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_add_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_add_epi8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_add_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_add_epi8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_add_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_add_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_add_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_add_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return 
(__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_sub_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_sub_epi8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_sub_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_sub_epi8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sub_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_sub_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sub_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_sub_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mullo_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_mullo_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mullo_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_mullo_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mullo_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_mullo_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mullo_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_mullo_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W) { - return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, - (__v16qi) __W, - (__v16qi) __A); + return (__m128i)__builtin_selectvector((__v16qi)__W, (__v16qi)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W) { - return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, - (__v32qi) __W, - (__v32qi) __A); + return (__m256i)__builtin_selectvector((__v32qi)__W, (__v32qi)__A, + 
__builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W) { - return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, - (__v8hi) __W, - (__v8hi) __A); + return (__m128i)__builtin_selectvector((__v8hi)__W, (__v8hi)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) { - return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, - (__v16hi) __W, - (__v16hi) __A); + return (__m256i)__builtin_selectvector((__v16hi)__W, (__v16hi)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_abs_epi8(__A), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_abs_epi8(__A), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_abs_epi8(__A), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_abs_epi8(__A), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_abs_epi8(__A), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_abs_epi8(__A), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_abs_epi8(__A), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_abs_epi8(__A), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_abs_epi16(__A), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_abs_epi16(__A), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_abs_epi16(__A), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_abs_epi16(__A), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_abs_epi16(__A), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_abs_epi16(__A), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_abs_epi16(__A), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_abs_epi16(__A), + (__v16hi)_mm256_setzero_si256(), + 
__builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_packs_epi32(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_packs_epi32(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_packs_epi32(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_packs_epi32(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_packs_epi32(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_packs_epi32(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_packs_epi32(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_packs_epi32(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_packs_epi16(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_packs_epi16(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_packs_epi16(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_packs_epi16(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_packs_epi16(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_packs_epi16(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_packs_epi16(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_packs_epi16(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_packus_epi32(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_packus_epi32(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, 
__M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_packus_epi32(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_packus_epi32(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_packus_epi32(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_packus_epi32(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_packus_epi32(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_packus_epi32(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_packus_epi16(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_packus_epi16(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_packus_epi16(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_packus_epi16(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_packus_epi16(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_packus_epi16(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_packus_epi16(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_packus_epi16(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_adds_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_adds_epi8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_adds_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_adds_epi8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ 
__m256i __DEFAULT_FN_ATTRS256 _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_adds_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_adds_epi8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_adds_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_adds_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_adds_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_adds_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_adds_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_adds_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_adds_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_adds_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_adds_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_adds_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_adds_epu8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_adds_epu8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_adds_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_adds_epu8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_adds_epu8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_adds_epu8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, 
__m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_adds_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_adds_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_adds_epu16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_adds_epu16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_adds_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_adds_epu16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_adds_epu16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_adds_epu16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_adds_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_adds_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_avg_epu8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_avg_epu8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_avg_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_avg_epu8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_avg_epu8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_avg_epu8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_avg_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_avg_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return 
(__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_avg_epu16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_avg_epu16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_avg_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_avg_epu16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_avg_epu16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_avg_epu16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_avg_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_avg_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_max_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_max_epi8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_max_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_max_epi8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_max_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_max_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_max_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_max_epi8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_max_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_max_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_max_epi16(__A, __B), - 
(__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_max_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_max_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_max_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_max_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_max_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_max_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_max_epu8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_max_epu8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_max_epu8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_max_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_max_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_max_epu8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_max_epu8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_max_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_max_epu16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_max_epu16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_max_epu16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_max_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_max_epu16(__A, __B), + 
(__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_max_epu16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_max_epu16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_min_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_min_epi8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_min_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_min_epi8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_min_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_min_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_min_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_min_epi8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_min_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_min_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_min_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_min_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_min_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_min_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_min_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_min_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 _mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_min_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_min_epu8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_min_epu8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_min_epu8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_min_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_min_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_min_epu8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_min_epu8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_min_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_min_epu16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_min_epu16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_min_epu16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_min_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_min_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_min_epu16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_min_epu16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_shuffle_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_shuffle_epi8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return 
(__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_shuffle_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_shuffle_epi8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_shuffle_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_shuffle_epi8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_shuffle_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_shuffle_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_subs_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_subs_epi8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_subs_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_subs_epi8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_subs_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_subs_epi8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_subs_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_subs_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_subs_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_subs_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_subs_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_subs_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return 
(__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_subs_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_subs_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_subs_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_subs_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_subs_epu8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_subs_epu8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_subs_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_subs_epu8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_subs_epu8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_subs_epu8(__A, __B), + (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_subs_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_subs_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_subs_epu16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_subs_epu16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_subs_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_subs_epu16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_subs_epu16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_subs_epu16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_subs_epu16(__A, 
__B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_subs_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -1288,27 +1284,27 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), - (__v8hi)__A); + return (__m128i)__builtin_selectvector( + (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), - (__v8hi)__I); + return (__m128i)__builtin_selectvector( + (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__I, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), + (__v8hi)_mm_setzero_si128(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -1322,84 +1318,84 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), - (__v16hi)__A); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), - (__v16hi)__I); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__I, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), + (__v16hi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_maddubs_epi16(__X, __Y), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_maddubs_epi16(__X, __Y), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_maddubs_epi16(__X, __Y), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_maddubs_epi16(__X, __Y), + (__v8hi)_mm_setzero_si128(), + 
__builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_maddubs_epi16(__X, __Y), - (__v16hi)__W); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_maddubs_epi16(__X, __Y), (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_maddubs_epi16(__X, __Y), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_maddubs_epi16(__X, __Y), (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_madd_epi16(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_madd_epi16(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_madd_epi16(__A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_madd_epi16(__A, __B), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_madd_epi16(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_madd_epi16(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_madd_epi16(__A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_madd_epi16(__A, __B), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -1534,16 +1530,16 @@ _mm256_cvtepi16_epi8 (__m256i __A) { static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm256_cvtepi16_epi8(__A), - (__v16qi)__O); + return (__m128i)__builtin_selectvector((__v16qi)_mm256_cvtepi16_epi8(__A), + (__v16qi)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm256_cvtepi16_epi8(__A), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm256_cvtepi16_epi8(__A), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ void __DEFAULT_FN_ATTRS256 @@ -1566,307 +1562,304 @@ _mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return 
(__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhrs_epi16(__X, __Y), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_mulhrs_epi16(__X, __Y), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhrs_epi16(__X, __Y), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_mulhrs_epi16(__X, __Y), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhrs_epi16(__X, __Y), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_mulhrs_epi16(__X, __Y), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhrs_epi16(__X, __Y), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_mulhrs_epi16(__X, __Y), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhi_epu16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_mulhi_epu16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhi_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_mulhi_epu16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epu16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_mulhi_epu16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_mulhi_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhi_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_mulhi_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - 
(__v8hi)_mm_mulhi_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_mulhi_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_mulhi_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_mulhi_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_unpackhi_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_unpackhi_epi8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_unpackhi_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_unpackhi_epi8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_unpackhi_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_unpackhi_epi8(__A, __B), (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_unpackhi_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_unpackhi_epi8(__A, __B), (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_unpackhi_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_unpackhi_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_unpackhi_epi16(__A, __B), - (__v8hi) _mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_unpackhi_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return 
(__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_unpackhi_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_unpackhi_epi16(__A, __B), (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_unpackhi_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_unpackhi_epi16(__A, __B), (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_unpacklo_epi8(__A, __B), - (__v16qi)__W); + return (__m128i)__builtin_selectvector((__v16qi)_mm_unpacklo_epi8(__A, __B), + (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_unpacklo_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_unpacklo_epi8(__A, __B), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_unpacklo_epi8(__A, __B), - (__v32qi)__W); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_unpacklo_epi8(__A, __B), (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_unpacklo_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_unpacklo_epi8(__A, __B), (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_unpacklo_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_unpacklo_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_unpacklo_epi16(__A, __B), - (__v8hi) _mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_unpacklo_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_unpacklo_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_unpacklo_epi16(__A, __B), (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, 
                             __m256i __B) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                       (__v16hi)_mm256_unpacklo_epi16(__A, __B),
-                                       (__v16hi)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
+      (__v16hi)_mm256_unpacklo_epi16(__A, __B), (__v16hi)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_cvtepi8_epi16(__A),
-                                             (__v8hi)__W);
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_cvtepi8_epi16(__A),
+                                         (__v8hi)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_cvtepi8_epi16(__A),
-                                             (__v8hi)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_cvtepi8_epi16(__A),
+                                         (__v8hi)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
-                                             (__v16hi)__W);
+  return (__m256i)__builtin_selectvector((__v16hi)_mm256_cvtepi8_epi16(__A),
+                                         (__v16hi)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
-                                             (__v16hi)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v16hi)_mm256_cvtepi8_epi16(__A),
+                                         (__v16hi)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_cvtepu8_epi16(__A),
-                                             (__v8hi)__W);
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_cvtepu8_epi16(__A),
+                                         (__v8hi)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_cvtepu8_epi16(__A),
-                                             (__v8hi)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_cvtepu8_epi16(__A),
+                                         (__v8hi)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
-                                             (__v16hi)__W);
+  return (__m256i)__builtin_selectvector((__v16hi)_mm256_cvtepu8_epi16(__A),
+                                         (__v16hi)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
-                                             (__v16hi)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v16hi)_mm256_cvtepu8_epi16(__A),
+                                         (__v16hi)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
+#define _mm_mask_shufflehi_epi16(W, U, A, imm)                                \
+  ((__m128i)__builtin_selectvector((__v8hi)_mm_shufflehi_epi16((A), (imm)),   \
+                                   (__v8hi)(__m128i)(W),                      \
+                                   __builtin_bit_cast(__vecmask8, (U))))
-#define _mm_mask_shufflehi_epi16(W, U, A, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
-                                       (__v8hi)(__m128i)(W)))
 
+#define _mm_maskz_shufflehi_epi16(U, A, imm)                                  \
+  ((__m128i)__builtin_selectvector((__v8hi)_mm_shufflehi_epi16((A), (imm)),   \
+                                   (__v8hi)_mm_setzero_si128(),               \
+                                   __builtin_bit_cast(__vecmask8, (U))))
-#define _mm_maskz_shufflehi_epi16(U, A, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
-                                       (__v8hi)_mm_setzero_si128()))
 
+#define _mm256_mask_shufflehi_epi16(W, U, A, imm)                             \
+  ((__m256i)__builtin_selectvector(                                           \
+      (__v16hi)_mm256_shufflehi_epi16((A), (imm)), (__v16hi)(__m256i)(W),     \
+      __builtin_bit_cast(__vecmask16, (U))))
-#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                       (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
-                                       (__v16hi)(__m256i)(W)))
 
+#define _mm256_maskz_shufflehi_epi16(U, A, imm)                               \
+  ((__m256i)__builtin_selectvector(                                           \
+      (__v16hi)_mm256_shufflehi_epi16((A), (imm)),                            \
+      (__v16hi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask16, (U))))
-#define _mm256_maskz_shufflehi_epi16(U, A, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                       (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
-                                       (__v16hi)_mm256_setzero_si256()))
 
+#define _mm_mask_shufflelo_epi16(W, U, A, imm)                                \
+  ((__m128i)__builtin_selectvector((__v8hi)_mm_shufflelo_epi16((A), (imm)),   \
+                                   (__v8hi)(__m128i)(W),                      \
+                                   __builtin_bit_cast(__vecmask8, (U))))
-#define _mm_mask_shufflelo_epi16(W, U, A, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
-                                       (__v8hi)(__m128i)(W)))
 
+#define _mm_maskz_shufflelo_epi16(U, A, imm)                                  \
+  ((__m128i)__builtin_selectvector((__v8hi)_mm_shufflelo_epi16((A), (imm)),   \
+                                   (__v8hi)_mm_setzero_si128(),               \
+                                   __builtin_bit_cast(__vecmask8, (U))))
-#define _mm_maskz_shufflelo_epi16(U, A, imm) \
-  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
-                                       (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
-                                       (__v8hi)_mm_setzero_si128()))
 
+#define _mm256_mask_shufflelo_epi16(W, U, A, imm)                             \
+  ((__m256i)__builtin_selectvector(                                           \
+      (__v16hi)_mm256_shufflelo_epi16((A), (imm)), (__v16hi)(__m256i)(W),     \
+      __builtin_bit_cast(__vecmask16, (U))))
-#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                       (__v16hi)_mm256_shufflelo_epi16((A), \
-                                                                       (imm)), \
-                                       (__v16hi)(__m256i)(W)))
-
-#define _mm256_maskz_shufflelo_epi16(U, A, imm) \
-  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
-                                       (__v16hi)_mm256_shufflelo_epi16((A), \
-                                                                       (imm)), \
-                                       (__v16hi)_mm256_setzero_si256()))
+#define _mm256_maskz_shufflelo_epi16(U, A, imm)                               \
+  ((__m256i)__builtin_selectvector(                                           \
+      (__v16hi)_mm256_shufflelo_epi16((A), (imm)),                            \
+      (__v16hi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask16, (U))))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sllv_epi16(__m256i __A, __m256i __B)
@@ -1877,17 +1870,17 @@ _mm256_sllv_epi16(__m256i __A, __m256i __B)
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_sllv_epi16(__A, __B),
-                                         (__v16hi)__W);
+  return (__m256i)__builtin_selectvector((__v16hi)_mm256_sllv_epi16(__A, __B),
+                                         (__v16hi)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                         (__v16hi)_mm256_sllv_epi16(__A, __B),
-                                         (__v16hi)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v16hi)_mm256_sllv_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -1899,82 +1892,82 @@ _mm_sllv_epi16(__m128i __A, __m128i __B)
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sllv_epi16(__A, __B),
-                                             (__v8hi)__W);
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_sllv_epi16(__A, __B),
+                                         (__v8hi)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sllv_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_sllv_epi16(__A, __B),
+                                         (__v8hi)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sll_epi16(__A, __B),
-                                             (__v8hi)__W);
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_sll_epi16(__A, __B),
+                                         (__v8hi)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_sll_epi16(__A, __B),
-                                             (__v8hi)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_sll_epi16(__A, __B),
+                                         (__v8hi)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_sll_epi16(__A, __B),
-                                          (__v16hi)__W);
+  return (__m256i)__builtin_selectvector((__v16hi)_mm256_sll_epi16(__A, __B),
+                                         (__v16hi)__W,
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
-                                          (__v16hi)_mm256_sll_epi16(__A, __B),
-                                          (__v16hi)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v16hi)_mm256_sll_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask16, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_slli_epi16(__A, (int)__B),
-                                             (__v8hi)__W);
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_slli_epi16(__A, (int)__B),
+                                         (__v8hi)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
}
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
 {
-  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
-                                             (__v8hi)_mm_slli_epi16(__A, (int)__B),
-                                             (__v8hi)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v8hi)_mm_slli_epi16(__A, (int)__B),
+                                         (__v8hi)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_slli_epi16(__m256i
__W, __mmask16 __U, __m256i __A, unsigned int __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_slli_epi16(__A, (int)__B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_slli_epi16(__A, (int)__B), (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_slli_epi16(__A, (int)__B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_slli_epi16(__A, (int)__B), + (__v16hi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -1986,17 +1979,17 @@ _mm256_srlv_epi16(__m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srlv_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_srlv_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srlv_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_srlv_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -2008,17 +2001,17 @@ _mm_srlv_epi16(__m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srlv_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srlv_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srlv_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srlv_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -2030,17 +2023,17 @@ _mm256_srav_epi16(__m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srav_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_srav_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srav_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_srav_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -2052,243 +2045,238 @@ _mm_srav_epi16(__m128i __A, __m128i __B) static 
__inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srav_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srav_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srav_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srav_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sra_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_sra_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sra_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_sra_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sra_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_sra_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sra_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_sra_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srai_epi16(__A, (int)__B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srai_epi16(__A, (int)__B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srai_epi16(__A, (int)__B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srai_epi16(__A, (int)__B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, unsigned int __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srai_epi16(__A, (int)__B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_srai_epi16(__A, (int)__B), (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srai_epi16(__mmask16 __U, 
__m256i __A, unsigned int __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srai_epi16(__A, (int)__B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_srai_epi16(__A, (int)__B), + (__v16hi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srl_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srl_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srl_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srl_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srl_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_srl_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srl_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_srl_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srli_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srli_epi16(__A, __B), + (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srli_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_srli_epi16(__A, __B), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srli_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_srli_epi16(__A, __B), + (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srli_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_srli_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) 
__builtin_ia32_selectw_128 ((__mmask8) __U, - (__v8hi) __A, - (__v8hi) __W); + return (__m128i)__builtin_selectvector((__v8hi)__A, (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, - (__v8hi) __A, - (__v8hi) _mm_setzero_si128 ()); + return (__m128i)__builtin_selectvector((__v8hi)__A, + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A) { - return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, - (__v16hi) __A, - (__v16hi) __W); + return (__m256i)__builtin_selectvector((__v16hi)__A, (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A) { - return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, - (__v16hi) __A, - (__v16hi) _mm256_setzero_si256 ()); + return (__m256i)__builtin_selectvector((__v16hi)__A, + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A) { - return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, - (__v16qi) __A, - (__v16qi) __W); + return (__m128i)__builtin_selectvector((__v16qi)__A, (__v16qi)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A) { - return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, - (__v16qi) __A, - (__v16qi) _mm_setzero_si128 ()); + return (__m128i)__builtin_selectvector((__v16qi)__A, + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A) { - return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, - (__v32qi) __A, - (__v32qi) __W); + return (__m256i)__builtin_selectvector((__v32qi)__A, (__v32qi)__W, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) { - return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, - (__v32qi) __A, - (__v32qi) _mm256_setzero_si256 ()); + return (__m256i)__builtin_selectvector((__v32qi)__A, + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __U)); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) { - return (__m128i) __builtin_ia32_selectb_128(__M, - (__v16qi) _mm_set1_epi8(__A), - (__v16qi) __O); + return (__m128i)__builtin_selectvector((__v16qi)_mm_set1_epi8(__A), + (__v16qi)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_set1_epi8 (__mmask16 __M, char __A) { - return (__m128i) __builtin_ia32_selectb_128(__M, - (__v16qi) _mm_set1_epi8(__A), - (__v16qi) _mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_set1_epi8(__A), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) { - return (__m256i) __builtin_ia32_selectb_256(__M, - (__v32qi) _mm256_set1_epi8(__A), - (__v32qi) __O); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_set1_epi8(__A), + (__v32qi)__O, + 
__builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_set1_epi8 (__mmask32 __M, char __A) { - return (__m256i) __builtin_ia32_selectb_256(__M, - (__v32qi) _mm256_set1_epi8(__A), - (__v32qi) _mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_set1_epi8(__A), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline __m128i __DEFAULT_FN_ATTRS128 @@ -2621,97 +2609,97 @@ _mm256_movm_epi16 (__mmask16 __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) { - return (__m128i)__builtin_ia32_selectb_128(__M, - (__v16qi) _mm_broadcastb_epi8(__A), - (__v16qi) __O); + return (__m128i)__builtin_selectvector((__v16qi)_mm_broadcastb_epi8(__A), + (__v16qi)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) { - return (__m128i)__builtin_ia32_selectb_128(__M, - (__v16qi) _mm_broadcastb_epi8(__A), - (__v16qi) _mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v16qi)_mm_broadcastb_epi8(__A), + (__v16qi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) { - return (__m256i)__builtin_ia32_selectb_256(__M, - (__v32qi) _mm256_broadcastb_epi8(__A), - (__v32qi) __O); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_broadcastb_epi8(__A), + (__v32qi)__O, + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) { - return (__m256i)__builtin_ia32_selectb_256(__M, - (__v32qi) _mm256_broadcastb_epi8(__A), - (__v32qi) _mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v32qi)_mm256_broadcastb_epi8(__A), + (__v32qi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask32, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i)__builtin_ia32_selectw_128(__M, - (__v8hi) _mm_broadcastw_epi16(__A), - (__v8hi) __O); + return (__m128i)__builtin_selectvector((__v8hi)_mm_broadcastw_epi16(__A), + (__v8hi)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) { - return (__m128i)__builtin_ia32_selectw_128(__M, - (__v8hi) _mm_broadcastw_epi16(__A), - (__v8hi) _mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_broadcastw_epi16(__A), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) { - return (__m256i)__builtin_ia32_selectw_256(__M, - (__v16hi) _mm256_broadcastw_epi16(__A), - (__v16hi) __O); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_broadcastw_epi16(__A), + (__v16hi)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) { - return (__m256i)__builtin_ia32_selectw_256(__M, - (__v16hi) _mm256_broadcastw_epi16(__A), - (__v16hi) _mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_broadcastw_epi16(__A), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) { - return (__m256i) __builtin_ia32_selectw_256 (__M, - (__v16hi) _mm256_set1_epi16(__A), - (__v16hi) __O); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_set1_epi16(__A), + (__v16hi)__O, + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) { - return (__m256i) __builtin_ia32_selectw_256(__M, - (__v16hi)_mm256_set1_epi16(__A), - (__v16hi) _mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v16hi)_mm256_set1_epi16(__A), + (__v16hi)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) { - return (__m128i) __builtin_ia32_selectw_128(__M, - (__v8hi) _mm_set1_epi16(__A), - (__v8hi) __O); + return (__m128i)__builtin_selectvector((__v8hi)_mm_set1_epi16(__A), + (__v8hi)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_set1_epi16 (__mmask8 __M, short __A) { - return (__m128i) __builtin_ia32_selectw_128(__M, - (__v8hi) _mm_set1_epi16(__A), - (__v8hi) _mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_set1_epi16(__A), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -2723,18 +2711,18 @@ _mm_permutexvar_epi16 (__m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_permutexvar_epi16(__A, __B), - (__v8hi) _mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v8hi)_mm_permutexvar_epi16(__A, __B), (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_permutexvar_epi16(__A, __B), - (__v8hi)__W); + return (__m128i)__builtin_selectvector( + (__v8hi)_mm_permutexvar_epi16(__A, __B), (__v8hi)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -2747,67 +2735,67 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_permutexvar_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_permutexvar_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask16, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_permutexvar_epi16(__A, __B), - (__v16hi)__W); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_permutexvar_epi16(__A, __B), (__v16hi)__W, + __builtin_bit_cast(__vecmask16, __M)); } -#define _mm_mask_alignr_epi8(W, U, A, B, N) \ - ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)(__m128i)(W))) +#define _mm_mask_alignr_epi8(W, U, A, B, N) \ + ((__m128i)__builtin_selectvector( \ + (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), 
(__v16qi)(__m128i)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm_maskz_alignr_epi8(U, A, B, N) \ - ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)_mm_setzero_si128())) +#define _mm_maskz_alignr_epi8(U, A, B, N) \ + ((__m128i)__builtin_selectvector( \ + (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ + (__v16qi)_mm_setzero_si128(), __builtin_bit_cast(__vecmask16, (U)))) -#define _mm256_mask_alignr_epi8(W, U, A, B, N) \ - ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)(__m256i)(W))) +#define _mm256_mask_alignr_epi8(W, U, A, B, N) \ + ((__m256i)__builtin_selectvector( \ + (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), (__v32qi)(__m256i)(W), \ + __builtin_bit_cast(__vecmask32, (U)))) -#define _mm256_maskz_alignr_epi8(U, A, B, N) \ - ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)_mm256_setzero_si256())) +#define _mm256_maskz_alignr_epi8(U, A, B, N) \ + ((__m256i)__builtin_selectvector( \ + (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ + (__v32qi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask32, (U)))) #define _mm_dbsad_epu8(A, B, imm) \ ((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(imm))) -#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ - (__v8hi)(__m128i)(W))) +#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \ + ((__m128i)__builtin_selectvector((__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ + (__v8hi)(__m128i)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm_maskz_dbsad_epu8(U, A, B, imm) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ - (__v8hi)_mm_setzero_si128())) +#define _mm_maskz_dbsad_epu8(U, A, B, imm) \ + ((__m128i)__builtin_selectvector((__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ + (__v8hi)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask8, (U)))) #define _mm256_dbsad_epu8(A, B, imm) \ ((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \ (__v32qi)(__m256i)(B), (int)(imm))) -#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ - (__v16hi)(__m256i)(W))) +#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \ + ((__m256i)__builtin_selectvector( \ + (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), (__v16hi)(__m256i)(W), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ - (__v16hi)_mm256_setzero_si256())) +#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \ + ((__m256i)__builtin_selectvector( \ + (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ + (__v16hi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask16, (U)))) static __inline__ short __DEFAULT_FN_ATTRS128 _mm_reduce_add_epi16(__m128i __W) { diff --git a/clang/lib/Headers/avx512vlcdintrin.h b/clang/lib/Headers/avx512vlcdintrin.h index 923e2c551a97a..217de347d83fa 100644 --- a/clang/lib/Headers/avx512vlcdintrin.h +++ b/clang/lib/Headers/avx512vlcdintrin.h @@ -57,17 +57,17 @@ _mm_conflict_epi64 (__m128i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return
(__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_conflict_epi64(__A), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_conflict_epi64(__A), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_conflict_epi64(__A), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_conflict_epi64(__A), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -79,17 +79,17 @@ _mm256_conflict_epi64 (__m256i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_conflict_epi64(__A), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_conflict_epi64(__A), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_conflict_epi64(__A), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_conflict_epi64(__A), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -101,17 +101,17 @@ _mm_conflict_epi32 (__m128i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_conflict_epi32(__A), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_conflict_epi32(__A), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_conflict_epi32(__A), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_conflict_epi32(__A), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -123,17 +123,17 @@ _mm256_conflict_epi32 (__m256i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_conflict_epi32(__A), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_conflict_epi32(__A), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_conflict_epi32(__A), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_conflict_epi32(__A), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -145,17 +145,17 @@ _mm_lzcnt_epi32 (__m128i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_lzcnt_epi32(__A), - (__v4si)__W); + return 
(__m128i)__builtin_selectvector((__v4si)_mm_lzcnt_epi32(__A), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_lzcnt_epi32(__A), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_lzcnt_epi32(__A), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -167,17 +167,17 @@ _mm256_lzcnt_epi32 (__m256i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_lzcnt_epi32(__A), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_lzcnt_epi32(__A), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_lzcnt_epi32(__A), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_lzcnt_epi32(__A), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -189,17 +189,17 @@ _mm_lzcnt_epi64 (__m128i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_lzcnt_epi64(__A), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_lzcnt_epi64(__A), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_lzcnt_epi64(__A), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_lzcnt_epi64(__A), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -211,17 +211,17 @@ _mm256_lzcnt_epi64 (__m256i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_lzcnt_epi64(__A), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_lzcnt_epi64(__A), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_lzcnt_epi64(__A), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_lzcnt_epi64(__A), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } #undef __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index 272cdd89e2d24..7e732b4dd8078 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -31,16 +31,16 @@ _mm256_mullo_epi64 (__m256i __A, __m256i __B) { static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_mullo_epi64(__A, __B), - (__v4di)__W); + return 
(__m256i)__builtin_selectvector((__v4di)_mm256_mullo_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_mullo_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_mullo_epi64(__A, __B), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -50,240 +50,240 @@ _mm_mullo_epi64 (__m128i __A, __m128i __B) { static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_mullo_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_mullo_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_mullo_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_mullo_epi64(__A, __B), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_andnot_pd(__A, __B), - (__v4df)__W); + return (__m256d)__builtin_selectvector((__v4df)_mm256_andnot_pd(__A, __B), + (__v4df)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_andnot_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector((__v4df)_mm256_andnot_pd(__A, __B), + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_andnot_pd(__A, __B), - (__v2df)__W); + return (__m128d)__builtin_selectvector((__v2df)_mm_andnot_pd(__A, __B), + (__v2df)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_andnot_pd(__A, __B), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector((__v2df)_mm_andnot_pd(__A, __B), + (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_andnot_ps(__A, __B), - (__v8sf)__W); + return (__m256)__builtin_selectvector((__v8sf)_mm256_andnot_ps(__A, __B), + (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_andnot_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); + return 
(__m256)__builtin_selectvector((__v8sf)_mm256_andnot_ps(__A, __B), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_andnot_ps(__A, __B), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm_andnot_ps(__A, __B), + (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_andnot_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm_andnot_ps(__A, __B), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_and_pd(__A, __B), - (__v4df)__W); + return (__m256d)__builtin_selectvector((__v4df)_mm256_and_pd(__A, __B), + (__v4df)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_and_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector((__v4df)_mm256_and_pd(__A, __B), + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_and_pd(__A, __B), - (__v2df)__W); + return (__m128d)__builtin_selectvector((__v2df)_mm_and_pd(__A, __B), + (__v2df)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_and_pd(__A, __B), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector((__v2df)_mm_and_pd(__A, __B), + (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_and_ps(__A, __B), - (__v8sf)__W); + return (__m256)__builtin_selectvector((__v8sf)_mm256_and_ps(__A, __B), + (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_and_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector((__v8sf)_mm256_and_ps(__A, __B), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_and_ps(__A, __B), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm_and_ps(__A, __B), + (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return 
(__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_and_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm_and_ps(__A, __B), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_xor_pd(__A, __B), - (__v4df)__W); + return (__m256d)__builtin_selectvector((__v4df)_mm256_xor_pd(__A, __B), + (__v4df)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_xor_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector((__v4df)_mm256_xor_pd(__A, __B), + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_xor_pd(__A, __B), - (__v2df)__W); + return (__m128d)__builtin_selectvector((__v2df)_mm_xor_pd(__A, __B), + (__v2df)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_xor_pd(__A, __B), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector((__v2df)_mm_xor_pd(__A, __B), + (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_xor_ps(__A, __B), - (__v8sf)__W); + return (__m256)__builtin_selectvector((__v8sf)_mm256_xor_ps(__A, __B), + (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_xor_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector((__v8sf)_mm256_xor_ps(__A, __B), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_xor_ps(__A, __B), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm_xor_ps(__A, __B), + (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_xor_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm_xor_ps(__A, __B), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_or_pd(__A, __B), - (__v4df)__W); + return (__m256d)__builtin_selectvector((__v4df)_mm256_or_pd(__A, __B), + (__v4df)__W, + __builtin_bit_cast(__vecmask4, __U)); } static 
__inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_or_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector((__v4df)_mm256_or_pd(__A, __B), + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_or_pd(__A, __B), - (__v2df)__W); + return (__m128d)__builtin_selectvector((__v2df)_mm_or_pd(__A, __B), + (__v2df)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_or_pd(__A, __B), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector((__v2df)_mm_or_pd(__A, __B), + (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_or_ps(__A, __B), - (__v8sf)__W); + return (__m256)__builtin_selectvector((__v8sf)_mm256_or_ps(__A, __B), + (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_or_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector((__v8sf)_mm256_or_ps(__A, __B), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_or_ps(__A, __B), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm_or_ps(__A, __B), + (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_or_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm_or_ps(__A, __B), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -461,16 +461,16 @@ _mm_cvtepi64_pd (__m128i __A) { static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtepi64_pd(__A), - (__v2df)__W); + return (__m128d)__builtin_selectvector((__v2df)_mm_cvtepi64_pd(__A), + (__v2df)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtepi64_pd(__A), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector((__v2df)_mm_cvtepi64_pd(__A), + (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 @@ -480,16 +480,16 @@ _mm256_cvtepi64_pd (__m256i __A) { static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 
__U, __m256i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtepi64_pd(__A), - (__v4df)__W); + return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtepi64_pd(__A), + (__v4df)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtepi64_pd(__A), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtepi64_pd(__A), + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -520,16 +520,16 @@ _mm256_cvtepi64_ps (__m256i __A) { static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtepi64_ps(__A), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm256_cvtepi64_ps(__A), + (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtepi64_ps(__A), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm256_cvtepi64_ps(__A), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -707,16 +707,16 @@ _mm_cvtepu64_pd (__m128i __A) { static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtepu64_pd(__A), - (__v2df)__W); + return (__m128d)__builtin_selectvector((__v2df)_mm_cvtepu64_pd(__A), + (__v2df)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtepu64_pd(__A), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector((__v2df)_mm_cvtepu64_pd(__A), + (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 @@ -726,16 +726,16 @@ _mm256_cvtepu64_pd (__m256i __A) { static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtepu64_pd(__A), - (__v4df)__W); + return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtepu64_pd(__A), + (__v4df)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtepu64_pd(__A), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtepu64_pd(__A), + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -766,16 +766,16 @@ _mm256_cvtepu64_ps (__m256i __A) { static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtepu64_ps(__A), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm256_cvtepu64_ps(__A), + (__v4sf)__W, + 
__builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtepu64_ps(__A), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm256_cvtepu64_ps(__A), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } #define _mm_range_pd(A, B, C) \ @@ -966,17 +966,17 @@ _mm256_broadcast_f32x2 (__m128 __A) static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, - (__v8sf)_mm256_broadcast_f32x2(__A), - (__v8sf)__O); + return (__m256)__builtin_selectvector((__v8sf)_mm256_broadcast_f32x2(__A), + (__v8sf)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, - (__v8sf)_mm256_broadcast_f32x2(__A), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector((__v8sf)_mm256_broadcast_f32x2(__A), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 @@ -989,17 +989,17 @@ _mm256_broadcast_f64x2(__m128d __A) static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, - (__v4df)_mm256_broadcast_f64x2(__A), - (__v4df)__O); + return (__m256d)__builtin_selectvector((__v4df)_mm256_broadcast_f64x2(__A), + (__v4df)__O, + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, - (__v4df)_mm256_broadcast_f64x2(__A), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector((__v4df)_mm256_broadcast_f64x2(__A), + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -1012,17 +1012,17 @@ _mm_broadcast_i32x2 (__m128i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_broadcast_i32x2(__A), - (__v4si)__O); + return (__m128i)__builtin_selectvector((__v4si)_mm_broadcast_i32x2(__A), + (__v4si)__O, + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_broadcast_i32x2(__A), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_broadcast_i32x2(__A), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -1035,17 +1035,17 @@ _mm256_broadcast_i32x2 (__m128i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_broadcast_i32x2(__A), - (__v8si)__O); + return (__m256i)__builtin_selectvector((__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)__O, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { - return 
(__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_broadcast_i32x2(__A), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -1058,17 +1058,17 @@ _mm256_broadcast_i64x2(__m128i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_broadcast_i64x2(__A), - (__v4di)__O); + return (__m256i)__builtin_selectvector((__v4di)_mm256_broadcast_i64x2(__A), + (__v4di)__O, + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_broadcast_i64x2(__A), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_broadcast_i64x2(__A), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __M)); } #define _mm256_extractf64x2_pd(A, imm) \ @@ -1111,29 +1111,29 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \ (__v2df)(__m128d)(B), (int)(imm))) -#define _mm256_mask_insertf64x2(W, U, A, B, imm) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)(__m256d)(W))) +#define _mm256_mask_insertf64x2(W, U, A, B, imm) \ + ((__m256d)__builtin_selectvector( \ + (__v4df)_mm256_insertf64x2((A), (B), (imm)), (__v4df)(__m256d)(W), \ + __builtin_bit_cast(__vecmask4, (U)))) -#define _mm256_maskz_insertf64x2(U, A, B, imm) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)_mm256_setzero_pd())) +#define _mm256_maskz_insertf64x2(U, A, B, imm) \ + ((__m256d)__builtin_selectvector( \ + (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ + (__v4df)_mm256_setzero_pd(), __builtin_bit_cast(__vecmask4, (U)))) #define _mm256_inserti64x2(A, B, imm) \ ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \ (__v2di)(__m128i)(B), (int)(imm))) -#define _mm256_mask_inserti64x2(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ - (__v4di)(__m256i)(W))) +#define _mm256_mask_inserti64x2(W, U, A, B, imm) \ + ((__m256i)__builtin_selectvector( \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), (__v4di)(__m256i)(W), \ + __builtin_bit_cast(__vecmask4, (U)))) -#define _mm256_maskz_inserti64x2(U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256())) +#define _mm256_maskz_inserti64x2(U, A, B, imm) \ + ((__m256i)__builtin_selectvector( \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask4, (U)))) #define _mm_mask_fpclass_pd_mask(U, A, imm) \ ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index a12acb7d9a24a..cebc0a3dd113d 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -88,14 +88,16 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, static __inline__ __m256h 
__DEFAULT_FN_ATTRS256 _mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_add_ph(__A, __B), + (__v16hf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_add_ph(__A, __B), + (__v16hf)_mm256_setzero_ph(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, @@ -107,15 +109,17 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), - (__v8hf)__W); + return (__m128h)__builtin_selectvector((__v8hf)_mm_add_ph(__A, __B), + (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), - (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector((__v8hf)_mm_add_ph(__A, __B), + (__v8hf)_mm_setzero_ph(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, @@ -125,14 +129,16 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_sub_ph(__A, __B), + (__v16hf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_sub_ph(__A, __B), + (__v16hf)_mm256_setzero_ph(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, @@ -144,15 +150,17 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), - (__v8hf)__W); + return (__m128h)__builtin_selectvector((__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), - (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector((__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)_mm_setzero_ph(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, @@ -162,14 +170,16 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return 
(__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_mul_ph(__A, __B), + (__v16hf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_mul_ph(__A, __B), + (__v16hf)_mm256_setzero_ph(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, @@ -181,15 +191,17 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), - (__v8hf)__W); + return (__m128h)__builtin_selectvector((__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), - (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector((__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)_mm_setzero_ph(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, @@ -199,14 +211,16 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_div_ph(__A, __B), + (__v16hf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_div_ph(__A, __B), + (__v16hf)_mm256_setzero_ph(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, @@ -218,15 +232,17 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), - (__v8hf)__W); + return (__m128h)__builtin_selectvector((__v8hf)_mm_div_ph(__A, __B), + (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), - (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector((__v8hf)_mm_div_ph(__A, __B), + (__v8hf)_mm_setzero_ph(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, @@ -236,18 +252,16 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( 
(__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), - (__v16hf)__W); + (__v16hf)__W, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), - (__v16hf)_mm256_setzero_ph()); + (__v16hf)_mm256_setzero_ph(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, @@ -259,17 +273,17 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), - (__v8hf)__W); + return (__m128h)__builtin_selectvector( + (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), - (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector( + (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, @@ -279,18 +293,16 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), - (__v16hf)__W); + (__v16hf)__W, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), - (__v16hf)_mm256_setzero_ph()); + (__v16hf)_mm256_setzero_ph(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, @@ -302,17 +314,17 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), - (__v8hf)__W); + return (__m128h)__builtin_selectvector( + (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), - (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector( + (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { @@ -329,14 +341,16 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) { static __inline__ __m256h __DEFAULT_FN_ATTRS256 
_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) { - return (__m256h)__builtin_ia32_selectps_256( - (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W); + return (__m256h)__builtin_selectvector((__v8sf)_mm256_conj_pch(__A), + (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) { - return (__m256h)__builtin_ia32_selectps_256( - (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps()); + return (__m256h)__builtin_selectvector((__v8sf)_mm256_conj_pch(__A), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) { @@ -346,14 +360,15 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) { static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W, __mmask8 __U, __m128h __A) { - return (__m128h)__builtin_ia32_selectps_128( - (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W); + return (__m128h)__builtin_selectvector((__v4sf)_mm_conj_pch(__A), (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_conj_pch(__mmask8 __U, __m128h __A) { - return (__m128h)__builtin_ia32_selectps_128( - (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps()); + return (__m128h)__builtin_selectvector((__v4sf)_mm_conj_pch(__A), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } #define _mm256_cmp_ph_mask(a, b, p) \ @@ -607,14 +622,15 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) { static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W, __mmask8 __U, __m128h __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W); + return (__m128h)__builtin_selectvector((__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U, __m128h __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector((__v8hf)_mm_sqrt_ph(__A), + (__v8hf)_mm_setzero_ph(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { @@ -623,15 +639,16 @@ static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_sqrt_ph(__A), + (__v16hf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, - (__v16hf)_mm256_sqrt_ph(__A), - (__v16hf)_mm256_setzero_ph()); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_sqrt_ph(__A), + (__v16hf)_mm256_setzero_ph(), + __builtin_bit_cast(__vecmask16, __U)); } #define _mm_mask_fpclass_ph_mask(U, A, imm) \ @@ -796,14 +813,16 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) { static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_selectph_128( - 
(__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W); + return (__m128h)__builtin_selectvector((__v8hf)_mm_cvtepi16_ph(__A), + (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector((__v8hf)_mm_cvtepi16_ph(__A), + (__v8hf)_mm_setzero_ph(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 @@ -813,15 +832,16 @@ _mm256_cvtepi16_ph(__m256i __A) { static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_cvtepi16_ph(__A), + (__v16hf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) { - return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, - (__v16hf)_mm256_cvtepi16_ph(__A), - (__v16hf)_mm256_setzero_ph()); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_cvtepi16_ph(__A), + (__v16hf)_mm256_setzero_ph(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) { @@ -900,14 +920,16 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) { static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W); + return (__m128h)__builtin_selectvector((__v8hf)_mm_cvtepu16_ph(__A), + (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector((__v8hf)_mm_cvtepu16_ph(__A), + (__v8hf)_mm_setzero_ph(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 @@ -917,15 +939,16 @@ _mm256_cvtepu16_ph(__m256i __A) { static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_cvtepu16_ph(__A), + (__v16hf)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) { - return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, - (__v16hf)_mm256_cvtepu16_ph(__A), - (__v16hf)_mm256_setzero_ph()); + return (__m256h)__builtin_selectvector((__v16hf)_mm256_cvtepu16_ph(__A), + (__v16hf)_mm256_setzero_ph(), + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) { @@ -1022,14 +1045,16 @@ _mm256_cvtepi32_ph(__m256i __A) { static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W); + return 
(__m128h)__builtin_selectvector((__v8hf)_mm256_cvtepi32_ph(__A), + (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector((__v8hf)_mm256_cvtepi32_ph(__A), + (__v8hf)_mm_setzero_ph(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) { @@ -1056,14 +1081,16 @@ _mm256_cvtepu32_ph(__m256i __A) { static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W); + return (__m128h)__builtin_selectvector((__v8hf)_mm256_cvtepu32_ph(__A), + (__v8hf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector((__v8hf)_mm256_cvtepu32_ph(__A), + (__v8hf)_mm_setzero_ph(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) { @@ -1427,26 +1454,23 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__A); + (__v8hf)__A, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__C); + (__v8hf)__C, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)_mm_setzero_ph()); + (__v8hf)_mm_setzero_ph(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A, @@ -1460,40 +1484,37 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__A); + return (__m128h)__builtin_selectvector( + _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), (__v8hf)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)_mm_setzero_ph()); + return (__m128h)__builtin_selectvector( + _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)_mm_setzero_ph(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h 
__DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__C); + (__v8hf)__C, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)_mm_setzero_ph()); + (__v8hf)_mm_setzero_ph(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)_mm_setzero_ph()); + (__v8hf)_mm_setzero_ph(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A, @@ -1505,26 +1526,23 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__A); + (__v16hf)__A, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__C); + (__v16hf)__C, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); + (__v16hf)_mm256_setzero_ph(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A, @@ -1536,42 +1554,37 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)__A); + (__v16hf)__A, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); + (__v16hf)_mm256_setzero_ph(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { 
- return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__C); + (__v16hf)__C, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); + (__v16hf)_mm256_setzero_ph(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); + (__v16hf)_mm256_setzero_ph(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A, @@ -1583,26 +1596,23 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A, static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__A); + (__v8hf)__A, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__C); + (__v8hf)__C, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)_mm_setzero_ph()); + (__v8hf)_mm_setzero_ph(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A, @@ -1614,18 +1624,16 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A, static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)__A); + (__v8hf)__A, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)_mm_setzero_ph()); + (__v8hf)_mm_setzero_ph(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 @@ -1636,26 +1644,23 @@ _mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) { static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 
__U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__A); + (__v16hf)__A, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__C); + (__v16hf)__C, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); + (__v16hf)_mm256_setzero_ph(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 @@ -1666,50 +1671,44 @@ _mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) { static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)__A); + (__v16hf)__A, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); + (__v16hf)_mm256_setzero_ph(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)__C); + (__v8hf)__C, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)__C); + (__v16hf)__C, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)__C); + (__v8hf)__C, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)__C); + (__v16hf)__C, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h 
__DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, @@ -1721,10 +1720,9 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C), - (__v8hf)__A); + (__v8hf)__A, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, @@ -1736,10 +1734,9 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C), - (__v16hf)__A); + (__v16hf)__A, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, @@ -1751,18 +1748,16 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), - (__v8hf)__A); + (__v8hf)__A, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), - (__v8hf)__C); + (__v8hf)__C, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, @@ -1774,18 +1769,16 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), - (__v16hf)__A); + (__v16hf)__A, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), - (__v16hf)__C); + (__v16hf)__C, __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A, @@ -1833,11 +1826,10 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A, static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectps_128( - __U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B, (__v4sf)__C, (__mmask8)__U), - (__v4sf)__A); + (__v4sf)__A, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 @@ -1861,11 +1853,10 @@ static 
__inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectps_256( - __U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U), - (__v8sf)__A); + (__v8sf)__A, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 @@ -1927,11 +1918,10 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A, static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectps_128( - __U, + return (__m128h)__builtin_selectvector( __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U), - (__v4sf)__A); + (__v4sf)__A, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 @@ -1955,11 +1945,10 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A, static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectps_256( - __U, + return (__m256h)__builtin_selectvector( __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U), - (__v8sf)__A); + (__v8sf)__A, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 @@ -1977,14 +1966,14 @@ _mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { - return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W, - (__v8hf)__A); + return (__m128h)__builtin_selectvector((__v8hf)__W, (__v8hf)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { - return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W, - (__v16hf)__A); + return (__m256h)__builtin_selectvector((__v16hf)__W, (__v16hf)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 2a5f7b43f63fc..8534c565e1a1f 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -232,225 +232,225 @@ typedef char __v2qi __attribute__((__vector_size__(2))); static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_add_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_add_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_add_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_add_epi32(__A, __B), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return
(__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_add_epi64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_add_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_add_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_add_epi64(__A, __B), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sub_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_sub_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sub_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_sub_epi32(__A, __B), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sub_epi64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_sub_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sub_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_sub_epi64(__A, __B), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_add_epi32(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_add_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_add_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_add_epi32(__A, __B), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_add_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_add_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_add_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); + return 
(__m128i)__builtin_selectvector((__v2di)_mm_add_epi64(__A, __B), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sub_epi32(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_sub_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sub_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_sub_epi32(__A, __B), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sub_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_sub_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sub_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_sub_epi64(__A, __B), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_mul_epi32(__X, __Y), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_mul_epi32(__X, __Y), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_mul_epi32(__X, __Y), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_mul_epi32(__X, __Y), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_mul_epi32(__X, __Y), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_mul_epi32(__X, __Y), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_mul_epi32(__X, __Y), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_mul_epi32(__X, __Y), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_mul_epu32(__X, __Y), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_mul_epu32(__X, __Y), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 _mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_mul_epu32(__X, __Y), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_mul_epu32(__X, __Y), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_mul_epu32(__X, __Y), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_mul_epu32(__X, __Y), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_mul_epu32(__X, __Y), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_mul_epu32(__X, __Y), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_mullo_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_mullo_epi32(__A, __B), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_mullo_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_mullo_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_mullo_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_mullo_epi32(__A, __B), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_mullo_epi32(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_mullo_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -462,9 +462,9 @@ _mm256_and_epi32(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_and_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_and_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -482,9 +482,9 @@ _mm_and_epi32(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_and_epi32(__A, __B), - (__v4si)__W); + return 
(__m128i)__builtin_selectvector((__v4si)_mm_and_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -502,9 +502,9 @@ _mm256_andnot_epi32(__m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_andnot_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_andnot_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -523,9 +523,9 @@ _mm_andnot_epi32(__m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_andnot_epi32(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_andnot_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -543,9 +543,9 @@ _mm256_or_epi32(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_or_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_or_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -563,9 +563,9 @@ _mm_or_epi32(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_or_epi32(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_or_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -583,9 +583,9 @@ _mm256_xor_epi32(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_xor_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_xor_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -603,9 +603,9 @@ _mm_xor_epi32(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_xor_epi32(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_xor_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -623,9 +623,9 @@ _mm256_and_epi64(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_and_epi64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_and_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -643,9 
+643,9 @@ _mm_and_epi64(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_and_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_and_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -663,9 +663,9 @@ _mm256_andnot_epi64(__m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_andnot_epi64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_andnot_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -684,9 +684,9 @@ _mm_andnot_epi64(__m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_andnot_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_andnot_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -704,9 +704,9 @@ _mm256_or_epi64(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_or_epi64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_or_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -724,9 +724,9 @@ _mm_or_epi64(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_or_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_or_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -744,9 +744,9 @@ _mm256_xor_epi64(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_xor_epi64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_xor_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -765,9 +765,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_xor_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_xor_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -899,834 +899,682 @@ _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) 
__builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __A); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)__A, __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __C); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)__C, __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)_mm_setzero_pd(), __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df) __A); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C), + (__v2df)__A, __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C), + (__v2df)_mm_setzero_pd(), __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __C); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)__C, __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)_mm_setzero_pd(), __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd (-(__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C), + (__v2df)_mm_setzero_pd(), __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 
_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __A); + return (__m256d)__builtin_selectvector( + __builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)__A, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __C); + return (__m256d)__builtin_selectvector( + __builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)__C, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector( + __builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)_mm256_setzero_pd(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df) __A); + return (__m256d)__builtin_selectvector( + __builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C), + (__v4df)__A, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector( + __builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C), + (__v4df)_mm256_setzero_pd(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __C); + return (__m256d)__builtin_selectvector( + __builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)__C, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector( + __builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)_mm256_setzero_pd(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 (-(__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector( + 
__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C), + (__v4df)_mm256_setzero_pd(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __A); + return (__m128)__builtin_selectvector( + __builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)__A, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __C); + return (__m128)__builtin_selectvector( + __builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)__C, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector( + __builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)_mm_setzero_ps(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __A); + return (__m128)__builtin_selectvector( + __builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C), + (__v4sf)__A, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector( + __builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C), + (__v4sf)_mm_setzero_ps(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __C); + return (__m128)__builtin_selectvector( + __builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)__C, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector( + __builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C), + (__v4sf)_mm_setzero_ps(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps (-(__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf)_mm_setzero_ps()); + return 
(__m128)__builtin_selectvector( + __builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C), + (__v4sf)_mm_setzero_ps(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __A); + return (__m256)__builtin_selectvector( + __builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)__A, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __C); + return (__m256)__builtin_selectvector( + __builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)__C, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector( + __builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)_mm256_setzero_ps(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __A); + return (__m256)__builtin_selectvector( + __builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C), + (__v8sf)__A, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector( + __builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C), + (__v8sf)_mm256_setzero_ps(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __C); + return (__m256)__builtin_selectvector( + __builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)__C, __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector( + __builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C), + (__v8sf)_mm256_setzero_ps(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - 
__builtin_ia32_vfmaddps256 (-(__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector( + __builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C), + (__v8sf)_mm256_setzero_ps(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __A); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)__A, __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __C); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)__C, __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C), + (__v2df)_mm_setzero_pd(), __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df) __A); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C), + (__v2df)__A, __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector( + __builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C), + (__v2df)_mm_setzero_pd(), __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __A); + return (__m256d)__builtin_selectvector( + __builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)__A, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __C); + return (__m256d)__builtin_selectvector( + __builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C), + (__v4df)__C, __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 
 _mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-      (__v4df) __B,
-      (__v4df) __C),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C),
+      (__v4df)_mm256_setzero_pd(), __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-      (__v4df) __B,
-      -(__v4df) __C),
-      (__v4df) __A);
+  return (__m256d)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C),
+      (__v4df)__A, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-      (__v4df) __B,
-      -(__v4df) __C),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C),
+      (__v4df)_mm256_setzero_pd(), __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-      (__v4sf) __B,
-      (__v4sf) __C),
-      (__v4sf) __A);
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C),
+      (__v4sf)__A, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-      (__v4sf) __B,
-      (__v4sf) __C),
-      (__v4sf) __C);
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C),
+      (__v4sf)__C, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-      (__v4sf) __B,
-      (__v4sf) __C),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C),
+      (__v4sf)_mm_setzero_ps(), __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-      (__v4sf) __B,
-      -(__v4sf) __C),
-      (__v4sf) __A);
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)__A, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-      (__v4sf) __B,
-      -(__v4sf) __C),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)_mm_setzero_ps(), __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-      (__v8sf) __B,
-      (__v8sf) __C),
-      (__v8sf) __A);
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C),
+      (__v8sf)__A, __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-      (__v8sf) __B,
-      (__v8sf) __C),
-      (__v8sf) __C);
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C),
+      (__v8sf)__C, __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-      (__v8sf) __B,
-      (__v8sf) __C),
-      (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C),
+      (__v8sf)_mm256_setzero_ps(), __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-      (__v8sf) __B,
-      -(__v8sf) __C),
-      (__v8sf) __A);
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)__A, __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-      (__v8sf) __B,
-      -(__v8sf) __C),
-      (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)_mm256_setzero_ps(), __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-      __builtin_ia32_vfmaddpd ((__v2df) __A,
-      (__v2df) __B,
-      -(__v2df) __C),
-      (__v2df) __C);
+  return (__m128d)__builtin_selectvector(
+      __builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C),
+      (__v2df)__C, __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-      __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-      (__v4df) __B,
-      -(__v4df) __C),
-      (__v4df) __C);
+  return (__m256d)__builtin_selectvector(
+      __builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C),
+      (__v4df)__C, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddps ((__v4sf) __A,
-      (__v4sf) __B,
-      -(__v4sf) __C),
-      (__v4sf) __C);
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)__C, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-      (__v8sf) __B,
-      -(__v8sf) __C),
-      (__v8sf) __C);
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)__C, __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-      __builtin_ia32_vfmaddsubpd ((__v2df) __A,
-      (__v2df) __B,
-      -(__v2df) __C),
-      (__v2df) __C);
+  return (__m128d)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C),
+      (__v2df)__C, __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
-      (__v4df) __B,
-      -(__v4df) __C),
-      (__v4df) __C);
+  return (__m256d)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C),
+      (__v4df)__C, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps ((__v4sf) __A,
-      (__v4sf) __B,
-      -(__v4sf) __C),
-      (__v4sf) __C);
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)__C, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
-      (__v8sf) __B,
-      -(__v8sf) __C),
-      (__v8sf) __C);
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)__C, __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-      __builtin_ia32_vfmaddpd ((__v2df) __A,
-      -(__v2df) __B,
-      (__v2df) __C),
-      (__v2df) __A);
+  return (__m128d)__builtin_selectvector(
+      __builtin_ia32_vfmaddpd((__v2df)__A, -(__v2df)__B, (__v2df)__C),
+      (__v2df)__A, __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-      __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-      -(__v4df) __B,
-      (__v4df) __C),
-      (__v4df) __A);
+  return (__m256d)__builtin_selectvector(
+      __builtin_ia32_vfmaddpd256((__v4df)__A, -(__v4df)__B, (__v4df)__C),
+      (__v4df)__A, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddps ((__v4sf) __A,
-      -(__v4sf) __B,
-      (__v4sf) __C),
-      (__v4sf) __A);
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddps((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C),
+      (__v4sf)__A, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-      -(__v8sf) __B,
-      (__v8sf) __C),
-      (__v8sf) __A);
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddps256((__v8sf)__A, -(__v8sf)__B, (__v8sf)__C),
+      (__v8sf)__A, __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-      __builtin_ia32_vfmaddpd ((__v2df) __A,
-      -(__v2df) __B,
-      -(__v2df) __C),
-      (__v2df) __A);
+  return (__m128d)__builtin_selectvector(
+      __builtin_ia32_vfmaddpd((__v2df)__A, -(__v2df)__B, -(__v2df)__C),
+      (__v2df)__A, __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
-  return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
-      __builtin_ia32_vfmaddpd ((__v2df) __A,
-      -(__v2df) __B,
-      -(__v2df) __C),
-      (__v2df) __C);
+  return (__m128d)__builtin_selectvector(
+      __builtin_ia32_vfmaddpd((__v2df)__A, -(__v2df)__B, -(__v2df)__C),
+      (__v2df)__C, __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-      __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-      -(__v4df) __B,
-      -(__v4df) __C),
-      (__v4df) __A);
+  return (__m256d)__builtin_selectvector(
+      __builtin_ia32_vfmaddpd256((__v4df)__A, -(__v4df)__B, -(__v4df)__C),
+      (__v4df)__A, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
-  return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
-      __builtin_ia32_vfmaddpd256 ((__v4df) __A,
-      -(__v4df) __B,
-      -(__v4df) __C),
-      (__v4df) __C);
+  return (__m256d)__builtin_selectvector(
+      __builtin_ia32_vfmaddpd256((__v4df)__A, -(__v4df)__B, -(__v4df)__C),
+      (__v4df)__C, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddps ((__v4sf) __A,
-      -(__v4sf) __B,
-      -(__v4sf) __C),
-      (__v4sf) __A);
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddps((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)__A, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
-  return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
-      __builtin_ia32_vfmaddps ((__v4sf) __A,
-      -(__v4sf) __B,
-      -(__v4sf) __C),
-      (__v4sf) __C);
+  return (__m128)__builtin_selectvector(
+      __builtin_ia32_vfmaddps((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C),
+      (__v4sf)__C, __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-      -(__v8sf) __B,
-      -(__v8sf) __C),
-      (__v8sf) __A);
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddps256((__v8sf)__A, -(__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)__A, __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
-  return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
-      __builtin_ia32_vfmaddps256 ((__v8sf) __A,
-      -(__v8sf) __B,
-      -(__v8sf) __C),
-      (__v8sf) __C);
+  return (__m256)__builtin_selectvector(
+      __builtin_ia32_vfmaddps256((__v8sf)__A, -(__v8sf)__B, -(__v8sf)__C),
+      (__v8sf)__C, __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_add_pd(__A, __B),
-      (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_add_pd(__A, __B),
+      (__v2df)__W,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_add_pd(__A, __B),
-      (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_add_pd(__A, __B),
+      (__v2df)_mm_setzero_pd(),
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_add_pd(__A, __B),
-      (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_add_pd(__A, __B),
+      (__v4df)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_add_pd(__A, __B),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_add_pd(__A, __B),
+      (__v4df)_mm256_setzero_pd(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_add_ps(__A, __B),
-      (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_add_ps(__A, __B),
+      (__v4sf)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_add_ps(__A, __B),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_add_ps(__A, __B),
+      (__v4sf)_mm_setzero_ps(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_add_ps(__A, __B),
-      (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_add_ps(__A, __B),
+      (__v8sf)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_add_ps(__A, __B),
-      (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_add_ps(__A, __B),
+      (__v8sf)_mm256_setzero_ps(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
-  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
-      (__v4si) __W,
-      (__v4si) __A);
+  return (__m128i)__builtin_selectvector((__v4si)__W, (__v4si)__A,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
-  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
-      (__v8si) __W,
-      (__v8si) __A);
+  return (__m256i)__builtin_selectvector((__v8si)__W, (__v8si)__A,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
-  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
-      (__v2df) __W,
-      (__v2df) __A);
+  return (__m128d)__builtin_selectvector((__v2df)__W, (__v2df)__A,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
-  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
-      (__v4df) __W,
-      (__v4df) __A);
+  return (__m256d)__builtin_selectvector((__v4df)__W, (__v4df)__A,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
-  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
-      (__v4sf) __W,
-      (__v4sf) __A);
+  return (__m128)__builtin_selectvector((__v4sf)__W, (__v4sf)__A,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
-  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
-      (__v8sf) __W,
-      (__v8sf) __A);
+_mm256_mask_blend_ps(__mmask8 __U, __m256 __A, __m256 __W) {
+  return (__m256)__builtin_selectvector((__v8sf)__W, (__v8sf)__A,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
-  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
-      (__v2di) __W,
-      (__v2di) __A);
+  return (__m128i)__builtin_selectvector((__v2di)__W, (__v2di)__A,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
-  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
-      (__v4di) __W,
-      (__v4di) __A);
+  return (__m256i)__builtin_selectvector((__v4di)__W, (__v4di)__A,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
@@ -1907,58 +1755,58 @@ _mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
-      (__v2df)_mm_cvtepi32_pd(__A),
-      (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_cvtepi32_pd(__A),
+      (__v2df)__W,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
-      (__v2df)_mm_cvtepi32_pd(__A),
-      (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_cvtepi32_pd(__A),
+      (__v2df)_mm_setzero_pd(),
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
-      (__v4df)_mm256_cvtepi32_pd(__A),
-      (__v4df)__W);
+_mm256_mask_cvtepi32_pd(__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtepi32_pd(__A),
+      (__v4df)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
-      (__v4df)_mm256_cvtepi32_pd(__A),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtepi32_pd(__A),
+      (__v4df)_mm256_setzero_pd(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_cvtepi32_ps(__A),
-      (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_cvtepi32_ps(__A),
+      (__v4sf)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_cvtepi32_ps(__A),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_cvtepi32_ps(__A),
+      (__v4sf)_mm_setzero_ps(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_cvtepi32_ps(__A),
-      (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_cvtepi32_ps(__A),
+      (__v8sf)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_cvtepi32_ps(__A),
-      (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_cvtepi32_ps(__A),
+      (__v8sf)_mm256_setzero_ps(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -1978,16 +1826,16 @@ _mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm256_cvtpd_epi32(__A),
-      (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm256_cvtpd_epi32(__A),
+      (__v4si)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm256_cvtpd_epi32(__A),
-      (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm256_cvtpd_epi32(__A),
+      (__v4si)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
@@ -2007,16 +1855,16 @@ _mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm256_cvtpd_ps(__A),
-      (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm256_cvtpd_ps(__A),
+      (__v4sf)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm256_cvtpd_ps(__A),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm256_cvtpd_ps(__A),
+      (__v4sf)_mm_setzero_ps(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -2067,58 +1915,57 @@ _mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm_cvtps_epi32(__A),
-      (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_cvtps_epi32(__A),
+      (__v4si)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm_cvtps_epi32(__A),
-      (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_cvtps_epi32(__A),
+      (__v4si)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-      (__v8si)_mm256_cvtps_epi32(__A),
-      (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtps_epi32(__A),
+      (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-      (__v8si)_mm256_cvtps_epi32(__A),
-      (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtps_epi32(__A),
+      (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_cvtps_pd(__A),
-      (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_cvtps_pd(__A), (__v2df)__W,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_cvtps_pd(__A),
-      (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_cvtps_pd(__A),
+      (__v2df)_mm_setzero_pd(),
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_cvtps_pd(__A),
-      (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtps_pd(__A),
+      (__v4df)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_cvtps_pd(__A),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtps_pd(__A),
+      (__v4df)_mm256_setzero_pd(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -2184,16 +2031,16 @@ _mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm256_cvttpd_epi32(__A),
-      (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm256_cvttpd_epi32(__A),
+      (__v4si)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm256_cvttpd_epi32(__A),
-      (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm256_cvttpd_epi32(__A),
+      (__v4si)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -2244,30 +2091,30 @@ _mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm_cvttps_epi32(__A),
-      (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_cvttps_epi32(__A),
+      (__v4si)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm_cvttps_epi32(__A),
-      (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_cvttps_epi32(__A),
+      (__v4si)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-      (__v8si)_mm256_cvttps_epi32(__A),
-      (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_cvttps_epi32(__A),
+      (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-      (__v8si)_mm256_cvttps_epi32(__A),
-      (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_cvttps_epi32(__A),
+      (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -2324,16 +2171,16 @@ _mm_cvtepu32_pd (__m128i __A) {
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
-      (__v2df)_mm_cvtepu32_pd(__A),
-      (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_cvtepu32_pd(__A),
+      (__v2df)__W,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
-      (__v2df)_mm_cvtepu32_pd(__A),
-      (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_cvtepu32_pd(__A),
+      (__v2df)_mm_setzero_pd(),
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
@@ -2343,16 +2190,16 @@ _mm256_cvtepu32_pd (__m128i __A) {
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
-      (__v4df)_mm256_cvtepu32_pd(__A),
-      (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtepu32_pd(__A),
+      (__v4df)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
-      (__v4df)_mm256_cvtepu32_pd(__A),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_cvtepu32_pd(__A),
+      (__v4df)_mm256_setzero_pd(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
@@ -2362,16 +2209,16 @@ _mm_cvtepu32_ps (__m128i __A) {
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_cvtepu32_ps(__A),
-      (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_cvtepu32_ps(__A),
+      (__v4sf)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_cvtepu32_ps(__A),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_cvtepu32_ps(__A),
+      (__v4sf)_mm_setzero_ps(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
@@ -2381,72 +2228,72 @@ _mm256_cvtepu32_ps (__m256i __A) {
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_cvtepu32_ps(__A),
-      (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_cvtepu32_ps(__A),
+      (__v8sf)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_cvtepu32_ps(__A),
-      (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_cvtepu32_ps(__A),
+      (__v8sf)_mm256_setzero_ps(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_div_pd(__A, __B),
-      (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_div_pd(__A, __B),
+      (__v2df)__W,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_div_pd(__A, __B),
-      (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_div_pd(__A, __B),
+      (__v2df)_mm_setzero_pd(),
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_div_pd(__A, __B),
-      (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_div_pd(__A, __B),
+      (__v4df)__W,
+      __builtin_bit_cast(__vecmask4, __U));
}
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_div_pd(__A, __B),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_div_pd(__A, __B),
+      (__v4df)_mm256_setzero_pd(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_div_ps(__A, __B),
-      (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_div_ps(__A, __B),
+      (__v4sf)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_div_ps(__A, __B),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_div_ps(__A, __B),
+      (__v4sf)_mm_setzero_ps(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_div_ps(__A, __B),
-      (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_div_ps(__A, __B),
+      (__v8sf)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_div_ps(__A, __B),
-      (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_div_ps(__A, __B),
+      (__v8sf)_mm256_setzero_ps(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
@@ -2798,198 +2645,198 @@ _mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_max_pd(__A, __B),
-      (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_max_pd(__A, __B),
+      (__v2df)__W,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_max_pd(__A, __B),
-      (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_max_pd(__A, __B),
+      (__v2df)_mm_setzero_pd(),
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_max_pd(__A, __B),
-      (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_max_pd(__A, __B),
+      (__v4df)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_max_pd(__A, __B),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_max_pd(__A, __B),
+      (__v4df)_mm256_setzero_pd(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_max_ps(__A, __B),
-      (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_max_ps(__A, __B),
+      (__v4sf)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_max_ps(__A, __B),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_max_ps(__A, __B),
+      (__v4sf)_mm_setzero_ps(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_max_ps(__A, __B),
-      (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_max_ps(__A, __B),
+      (__v8sf)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_max_ps(__A, __B),
-      (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_max_ps(__A, __B),
+      (__v8sf)_mm256_setzero_ps(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_min_pd(__A, __B),
-      (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_min_pd(__A, __B),
+      (__v2df)__W,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_min_pd(__A, __B),
-      (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_min_pd(__A, __B),
+      (__v2df)_mm_setzero_pd(),
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_min_pd(__A, __B),
-      (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_min_pd(__A, __B),
+      (__v4df)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_min_pd(__A, __B),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_min_pd(__A, __B),
+      (__v4df)_mm256_setzero_pd(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_min_ps(__A, __B),
-      (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_min_ps(__A, __B),
+      (__v4sf)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_min_ps(__A, __B),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_min_ps(__A, __B),
+      (__v4sf)_mm_setzero_ps(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_min_ps(__A, __B),
-      (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_min_ps(__A, __B),
+      (__v8sf)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_min_ps(__A, __B),
-      (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_min_ps(__A, __B),
+      (__v8sf)_mm256_setzero_ps(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_mul_pd(__A, __B),
-      (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_mul_pd(__A, __B),
+      (__v2df)__W,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-      (__v2df)_mm_mul_pd(__A, __B),
-      (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_mul_pd(__A, __B),
+      (__v2df)_mm_setzero_pd(),
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_mul_pd(__A, __B),
-      (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_mul_pd(__A, __B),
+      (__v4df)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-      (__v4df)_mm256_mul_pd(__A, __B),
-      (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_mul_pd(__A, __B),
+      (__v4df)_mm256_setzero_pd(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_mul_ps(__A, __B),
-      (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_mul_ps(__A, __B),
+      (__v4sf)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-      (__v4sf)_mm_mul_ps(__A, __B),
-      (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_mul_ps(__A, __B),
+      (__v4sf)_mm_setzero_ps(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_mul_ps(__A, __B),
-      (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_mul_ps(__A, __B),
+      (__v8sf)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-      (__v8sf)_mm256_mul_ps(__A, __B),
-      (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_mul_ps(__A, __B),
+      (__v8sf)_mm256_setzero_ps(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm_abs_epi32(__A),
-      (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_abs_epi32(__A),
+      (__v4si)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-      (__v4si)_mm_abs_epi32(__A),
-      (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_abs_epi32(__A),
+      (__v4si)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-      (__v8si)_mm256_abs_epi32(__A),
-      (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_abs_epi32(__A),
+      (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-      (__v8si)_mm256_abs_epi32(__A),
-      (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_abs_epi32(__A),
+      (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -2999,16 +2846,16 @@ _mm_abs_epi64 (__m128i __A) {
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-      (__v2di)_mm_abs_epi64(__A),
-      (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_abs_epi64(__A),
+      (__v2di)__W,
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-      (__v2di)_mm_abs_epi64(__A),
-      (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_abs_epi64(__A),
+      (__v2di)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -3018,44 +2865,44 @@ _mm256_abs_epi64 (__m256i __A) {
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-      (__v4di)_mm256_abs_epi64(__A),
-      (__v4di)__W);
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_abs_epi64(__A),
+      (__v4di)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-      (__v4di)_mm256_abs_epi64(__A),
-      (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_abs_epi64(__A),
+      (__v4di)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-      (__v4si)_mm_max_epi32(__A, __B),
-      (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_max_epi32(__A, __B),
+      (__v4si)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-      (__v4si)_mm_max_epi32(__A, __B),
-      (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_max_epi32(__A, __B),
+      (__v4si)__W,
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-      (__v8si)_mm256_max_epi32(__A, __B),
-      (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_max_epi32(__A, __B),
+      (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-      (__v8si)_mm256_max_epi32(__A, __B),
-      (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_max_epi32(__A, __B),
+      (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -3065,16 +2912,16 @@ _mm_max_epi64 (__m128i __A, __m128i __B) {
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-      (__v2di)_mm_max_epi64(__A, __B),
-      (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_max_epi64(__A, __B),
+      (__v2di)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-      (__v2di)_mm_max_epi64(__A, __B),
-      (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_max_epi64(__A, __B),
+      (__v2di)__W,
+      __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -3084,44 +2931,44 @@ _mm256_max_epi64 (__m256i __A, __m256i __B) {
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-      (__v4di)_mm256_max_epi64(__A, __B),
-      (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_max_epi64(__A, __B),
+      (__v4di)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-      (__v4di)_mm256_max_epi64(__A, __B),
-      (__v4di)__W);
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_max_epi64(__A, __B),
+      (__v4di)__W,
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-      (__v4si)_mm_max_epu32(__A, __B),
-      (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_max_epu32(__A, __B),
+      (__v4si)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-      (__v4si)_mm_max_epu32(__A, __B),
-      (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_max_epu32(__A, __B),
+      (__v4si)__W,
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-      (__v8si)_mm256_max_epu32(__A, __B),
-      (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_max_epu32(__A, __B),
+      (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-      (__v8si)_mm256_max_epu32(__A, __B),
-      (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_max_epu32(__A, __B),
+      (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -3131,16 +2978,16 @@ _mm_max_epu64 (__m128i __A, __m128i __B) {
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-      (__v2di)_mm_max_epu64(__A, __B),
-      (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_max_epu64(__A, __B),
+      (__v2di)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-      (__v2di)_mm_max_epu64(__A, __B),
-      (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_max_epu64(__A, __B),
+      (__v2di)__W,
+      __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -3150,44 +2997,44 @@ _mm256_max_epu64 (__m256i __A, __m256i __B) {
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-      (__v4di)_mm256_max_epu64(__A, __B),
-      (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_max_epu64(__A, __B),
+      (__v4di)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-      (__v4di)_mm256_max_epu64(__A, __B),
-      (__v4di)__W);
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_max_epu64(__A, __B),
+      (__v4di)__W,
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-      (__v4si)_mm_min_epi32(__A, __B),
-      (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_min_epi32(__A, __B),
+      (__v4si)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-      (__v4si)_mm_min_epi32(__A, __B),
-      (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_min_epi32(__A, __B),
+      (__v4si)__W,
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-      (__v8si)_mm256_min_epi32(__A, __B),
-      (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_min_epi32(__A, __B),
+      (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-      (__v8si)_mm256_min_epi32(__A, __B),
-      (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_min_epi32(__A, __B),
+      (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -3197,16 +3044,16 @@ _mm_min_epi64 (__m128i __A, __m128i __B) {
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-      (__v2di)_mm_min_epi64(__A, __B),
-      (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_min_epi64(__A, __B),
+      (__v2di)__W,
+      __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
-      (__v2di)_mm_min_epi64(__A, __B),
-      (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_min_epi64(__A, __B),
+      (__v2di)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -3216,44 +3063,44 @@ _mm256_min_epi64 (__m256i __A, __m256i __B) {
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-      (__v4di)_mm256_min_epi64(__A, __B),
-      (__v4di)__W);
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_min_epi64(__A, __B),
+      (__v4di)__W,
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-      (__v4di)_mm256_min_epi64(__A, __B),
-      (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_min_epi64(__A, __B),
+      (__v4di)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-      (__v4si)_mm_min_epu32(__A, __B),
-      (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_min_epu32(__A, __B),
+      (__v4si)_mm_setzero_si128(),
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-      (__v4si)_mm_min_epu32(__A, __B),
-      (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_min_epu32(__A, __B),
+      (__v4si)__W,
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-      (__v8si)_mm256_min_epu32(__A, __B),
-      (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_min_epu32(__A, __B),
+      (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-      (__v8si)_mm256_min_epu32(__A, __B),
-      (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_min_epu32(__A, __B),
+      (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -3263,16 +3110,16 @@ _mm_min_epu64 (__m128i __A, __m128i __B) {
__inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_min_epu64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_min_epu64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_min_epu64(__A, __B), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_min_epu64(__A, __B), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -3282,16 +3129,16 @@ _mm256_min_epu64 (__m256i __A, __m256i __B) { static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_min_epu64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_min_epu64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __M)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_min_epu64(__A, __B), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_min_epu64(__A, __B), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __M)); } #define _mm_roundscale_pd(A, imm) \ @@ -3637,114 +3484,113 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sqrt_pd(__A), - (__v2df)__W); + return (__m128d)__builtin_selectvector((__v2df)_mm_sqrt_pd(__A), + (__v2df)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sqrt_pd(__A), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector((__v2df)_mm_sqrt_pd(__A), + (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sqrt_pd(__A), - (__v4df)__W); + return (__m256d)__builtin_selectvector((__v4df)_mm256_sqrt_pd(__A), + (__v4df)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sqrt_pd(__A), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector((__v4df)_mm256_sqrt_pd(__A), + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sqrt_ps(__A), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm_sqrt_ps(__A), (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 
_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sqrt_ps(__A), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm_sqrt_ps(__A), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sqrt_ps(__A), - (__v8sf)__W); + return (__m256)__builtin_selectvector((__v8sf)_mm256_sqrt_ps(__A), + (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sqrt_ps(__A), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector((__v8sf)_mm256_sqrt_ps(__A), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sub_pd(__A, __B), - (__v2df)__W); + return (__m128d)__builtin_selectvector((__v2df)_mm_sub_pd(__A, __B), + (__v2df)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sub_pd(__A, __B), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector((__v2df)_mm_sub_pd(__A, __B), + (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sub_pd(__A, __B), - (__v4df)__W); + return (__m256d)__builtin_selectvector((__v4df)_mm256_sub_pd(__A, __B), + (__v4df)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sub_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector((__v4df)_mm256_sub_pd(__A, __B), + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sub_ps(__A, __B), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm_sub_ps(__A, __B), + (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sub_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm_sub_ps(__A, __B), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sub_ps(__A, __B), - (__v8sf)__W); + return (__m256)__builtin_selectvector((__v8sf)_mm256_sub_ps(__A, __B), + (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } 
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sub_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector((__v8sf)_mm256_sub_ps(__A, __B), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -3756,25 +3602,25 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_permutex2var_epi32(__A, __I, __B), - (__v4si)__A); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_permutex2var_epi32(__A, __I, __B), (__v4si)__A, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_permutex2var_epi32(__A, __I, __B), - (__v4si)__I); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_permutex2var_epi32(__A, __I, __B), (__v4si)__I, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_permutex2var_epi32(__A, __I, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_permutex2var_epi32(__A, __I, __B), + (__v4si)_mm_setzero_si128(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -3786,25 +3632,25 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), - (__v8si)__A); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), (__v8si)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), - (__v8si)__I); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), (__v8si)__I, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), + (__v8si)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 @@ -3815,23 +3661,23 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128(__U, - (__v2df)_mm_permutex2var_pd(__A, __I, __B), - (__v2df)__A); + return (__m128d)__builtin_selectvector( + 
(__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)__A, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128(__U, - (__v2df)_mm_permutex2var_pd(__A, __I, __B), - (__v2df)(__m128d)__I); + return (__m128d)__builtin_selectvector( + (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)(__m128d)__I, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128(__U, - (__v2df)_mm_permutex2var_pd(__A, __I, __B), - (__v2df)_mm_setzero_pd()); + return (__m128d)__builtin_selectvector( + (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 @@ -3843,25 +3689,25 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256(__U, - (__v4df)_mm256_permutex2var_pd(__A, __I, __B), - (__v4df)__A); + return (__m256d)__builtin_selectvector( + (__v4df)_mm256_permutex2var_pd(__A, __I, __B), (__v4df)__A, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256(__U, - (__v4df)_mm256_permutex2var_pd(__A, __I, __B), - (__v4df)(__m256d)__I); + return (__m256d)__builtin_selectvector( + (__v4df)_mm256_permutex2var_pd(__A, __I, __B), (__v4df)(__m256d)__I, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256(__U, - (__v4df)_mm256_permutex2var_pd(__A, __I, __B), - (__v4df)_mm256_setzero_pd()); + return (__m256d)__builtin_selectvector( + (__v4df)_mm256_permutex2var_pd(__A, __I, __B), + (__v4df)_mm256_setzero_pd(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -3872,23 +3718,23 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128(__U, - (__v4sf)_mm_permutex2var_ps(__A, __I, __B), - (__v4sf)__A); + return (__m128)__builtin_selectvector(_mm_permutex2var_ps(__A, __I, __B), + (__v4sf)__A, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128(__U, - (__v4sf)_mm_permutex2var_ps(__A, __I, __B), - (__v4sf)(__m128)__I); + return (__m128)__builtin_selectvector(_mm_permutex2var_ps(__A, __I, __B), + (__v4sf)(__m128)__I, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128(__U, - (__v4sf)_mm_permutex2var_ps(__A, __I, __B), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector(_mm_permutex2var_ps(__A, __I, __B), + (__v4sf)_mm_setzero_ps(), + 
__builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 @@ -3899,25 +3745,25 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256(__U, - (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), - (__v8sf)__A); + return (__m256)__builtin_selectvector(_mm256_permutex2var_ps(__A, __I, __B), + (__v8sf)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256(__U, - (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), - (__v8sf)(__m256)__I); + return (__m256)__builtin_selectvector(_mm256_permutex2var_ps(__A, __I, __B), + (__v8sf)(__m256)__I, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256(__U, - (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector(_mm256_permutex2var_ps(__A, __I, __B), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -3929,28 +3775,27 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_permutex2var_epi64(__A, __I, __B), - (__v2di)__A); + return (__m128i)__builtin_selectvector( + (__v2di)_mm_permutex2var_epi64(__A, __I, __B), (__v2di)__A, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_permutex2var_epi64(__A, __I, __B), - (__v2di)__I); + return (__m128i)__builtin_selectvector( + (__v2di)_mm_permutex2var_epi64(__A, __I, __B), (__v2di)__I, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_permutex2var_epi64(__A, __I, __B), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v2di)_mm_permutex2var_epi64(__A, __I, __B), + (__v2di)_mm_setzero_si128(), __builtin_bit_cast(__vecmask2, __U)); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I, @@ -3960,421 +3805,418 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), - (__v4di)__A); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), (__v4di)__A, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { - return 
(__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), - (__v4di)__I); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), (__v4di)__I, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), + (__v4di)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi8_epi32(__A), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_cvtepi8_epi32(__A), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi8_epi32(__A), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_cvtepi8_epi32(__A), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi8_epi32(__A), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtepi8_epi32(__A), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi8_epi32(__A), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtepi8_epi32(__A), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi8_epi64(__A), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepi8_epi64(__A), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi8_epi64(__A), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepi8_epi64(__A), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi8_epi64(__A), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepi8_epi64(__A), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi8_epi64(__A), - (__v4di)_mm256_setzero_si256()); + return 
(__m256i)__builtin_selectvector((__v4di)_mm256_cvtepi8_epi64(__A), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi32_epi64(__X), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepi32_epi64(__X), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi32_epi64(__X), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepi32_epi64(__X), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi32_epi64(__X), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepi32_epi64(__X), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi32_epi64(__X), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepi32_epi64(__X), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi16_epi32(__A), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_cvtepi16_epi32(__A), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi16_epi32(__A), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_cvtepi16_epi32(__A), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi16_epi32(__A), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtepi16_epi32(__A), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi16_epi32(__A), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtepi16_epi32(__A), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi16_epi64(__A), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepi16_epi64(__A), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 
_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi16_epi64(__A), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepi16_epi64(__A), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi16_epi64(__A), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepi16_epi64(__A), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi16_epi64(__A), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepi16_epi64(__A), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu8_epi32(__A), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_cvtepu8_epi32(__A), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu8_epi32(__A), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_cvtepu8_epi32(__A), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu8_epi32(__A), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtepu8_epi32(__A), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu8_epi32(__A), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtepu8_epi32(__A), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu8_epi64(__A), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepu8_epi64(__A), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu8_epi64(__A), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepu8_epi64(__A), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu8_epi64(__A), - (__v4di)__W); + return 
(__m256i)__builtin_selectvector((__v4di)_mm256_cvtepu8_epi64(__A), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu8_epi64(__A), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepu8_epi64(__A), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu32_epi64(__X), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepu32_epi64(__X), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu32_epi64(__X), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepu32_epi64(__X), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu32_epi64(__X), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepu32_epi64(__X), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu32_epi64(__X), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepu32_epi64(__X), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu16_epi32(__A), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_cvtepu16_epi32(__A), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu16_epi32(__A), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_cvtepu16_epi32(__A), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu16_epi32(__A), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtepu16_epi32(__A), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu16_epi32(__A), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_cvtepu16_epi32(__A), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 
_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu16_epi64(__A), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepu16_epi64(__A), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu16_epi64(__A), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_cvtepu16_epi64(__A), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu16_epi64(__A), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepu16_epi64(__A), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu16_epi64(__A), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_cvtepu16_epi64(__A), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } - #define _mm_rol_epi32(a, b) \ ((__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b))) -#define _mm_mask_rol_epi32(w, u, a, b) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_rol_epi32((a), (b)), \ - (__v4si)(__m128i)(w))) - -#define _mm_maskz_rol_epi32(u, a, b) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_rol_epi32((a), (b)), \ - (__v4si)_mm_setzero_si128())) - -#define _mm256_rol_epi32(a, b) \ - ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b))) - -#define _mm256_mask_rol_epi32(w, u, a, b) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_rol_epi32((a), (b)), \ - (__v8si)(__m256i)(w))) - -#define _mm256_maskz_rol_epi32(u, a, b) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_rol_epi32((a), (b)), \ - (__v8si)_mm256_setzero_si256())) - -#define _mm_rol_epi64(a, b) \ - ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b))) - -#define _mm_mask_rol_epi64(w, u, a, b) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_rol_epi64((a), (b)), \ - (__v2di)(__m128i)(w))) - -#define _mm_maskz_rol_epi64(u, a, b) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_rol_epi64((a), (b)), \ - (__v2di)_mm_setzero_si128())) - -#define _mm256_rol_epi64(a, b) \ - ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b))) - -#define _mm256_mask_rol_epi64(w, u, a, b) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_rol_epi64((a), (b)), \ - (__v4di)(__m256i)(w))) - -#define _mm256_maskz_rol_epi64(u, a, b) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_rol_epi64((a), (b)), \ - (__v4di)_mm256_setzero_si256())) - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_rolv_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B); -} +#define _mm_mask_rol_epi32(w, u, a, b) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)(__m128i)(w), \ + __builtin_bit_cast(__vecmask4, (__mmask8)(u)))) +
+#define _mm_maskz_rol_epi32(u, a, b) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask4, (__mmask8)(u)))) + +#define _mm256_rol_epi32(a, b) \ + ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b))) + +#define _mm256_mask_rol_epi32(w, u, a, b) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)(__m256i)(w), \ + __builtin_bit_cast(__vecmask8, (__mmask8)(u)))) + +#define _mm256_maskz_rol_epi32(u, a, b) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask8, (__mmask8)(u)))) + +#define _mm_rol_epi64(a, b) \ + ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b))) + +#define _mm_mask_rol_epi64(w, u, a, b) \ + ((__m128i)__builtin_selectvector((__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)(__m128i)(w), \ + __builtin_bit_cast(__vecmask2, (__mmask8)(u)))) + +#define _mm_maskz_rol_epi64(u, a, b) \ + ((__m128i)__builtin_selectvector((__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask2, (__mmask8)(u)))) + +#define _mm256_rol_epi64(a, b) \ + ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b))) + +#define _mm256_mask_rol_epi64(w, u, a, b) \ + ((__m256i)__builtin_selectvector((__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)(__m256i)(w), \ + __builtin_bit_cast(__vecmask4, (__mmask8)(u)))) + +#define _mm256_maskz_rol_epi64(u, a, b) \ + ((__m256i)__builtin_selectvector((__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask4, (__mmask8)(u)))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rolv_epi32(__m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B); +} static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_rolv_epi32(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_rolv_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_rolv_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_rolv_epi32(__A, __B), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -4386,17 +4228,17 @@ _mm256_rolv_epi32 (__m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_rolv_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_rolv_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_rolv_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_rolv_epi32(__A, __B), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -4408,17 +4250,17 @@ _mm_rolv_epi64 (__m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_rolv_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_rolv_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_rolv_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_rolv_epi64(__A, __B), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -4430,197 +4272,197 @@ _mm256_rolv_epi64 (__m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_rolv_epi64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_rolv_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_rolv_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_rolv_epi64(__A, __B), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } #define _mm_ror_epi32(a, b) \ ((__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b))) -#define _mm_mask_ror_epi32(w, u, a, b) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_ror_epi32((a), (b)), \ - (__v4si)(__m128i)(w))) +#define _mm_mask_ror_epi32(w, u, a, b) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)(__m128i)(w), \ + __builtin_bit_cast(__vecmask4, (__mmask8)(u)))) -#define _mm_maskz_ror_epi32(u, a, b) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_ror_epi32((a), (b)), \ - (__v4si)_mm_setzero_si128())) +#define _mm_maskz_ror_epi32(u, a, b) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask4, (__mmask8)(u)))) #define _mm256_ror_epi32(a, b) \ ((__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b))) -#define _mm256_mask_ror_epi32(w, u, a, b) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_ror_epi32((a), (b)), \ - (__v8si)(__m256i)(w))) +#define _mm256_mask_ror_epi32(w, u, a, b) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)(__m256i)(w), \ + __builtin_bit_cast(__vecmask8, (__mmask8)(u)))) -#define _mm256_maskz_ror_epi32(u, a, b) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_ror_epi32((a), (b)), \ - (__v8si)_mm256_setzero_si256())) +#define _mm256_maskz_ror_epi32(u, a, b) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask8, (__mmask8)(u)))) #define _mm_ror_epi64(a, b) \ ((__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b))) -#define _mm_mask_ror_epi64(w, u, a, b) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_ror_epi64((a), (b)), \ - (__v2di)(__m128i)(w))) +#define _mm_mask_ror_epi64(w, u, a, b) \ + ((__m128i)__builtin_selectvector((__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)(__m128i)(w), \ + __builtin_bit_cast(__vecmask2, (__mmask8)(u))))
-#define _mm_maskz_ror_epi64(u, a, b) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_ror_epi64((a), (b)), \ - (__v2di)_mm_setzero_si128())) +#define _mm_maskz_ror_epi64(u, a, b) \ + ((__m128i)__builtin_selectvector((__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask2, (__mmask8)(u)))) #define _mm256_ror_epi64(a, b) \ ((__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b))) -#define _mm256_mask_ror_epi64(w, u, a, b) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_ror_epi64((a), (b)), \ - (__v4di)(__m256i)(w))) +#define _mm256_mask_ror_epi64(w, u, a, b) \ + ((__m256i)__builtin_selectvector((__v4di)_mm256_ror_epi64((a), (b)), \ + (__v4di)(__m256i)(w), \ + __builtin_bit_cast(__vecmask4, (__mmask8)(u)))) -#define _mm256_maskz_ror_epi64(u, a, b) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_ror_epi64((a), (b)), \ - (__v4di)_mm256_setzero_si256())) +#define _mm256_maskz_ror_epi64(u, a, b) \ + ((__m256i)__builtin_selectvector((__v4di)_mm256_ror_epi64((a), (b)), \ + (__v4di)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask4, (__mmask8)(u)))) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sll_epi32(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_sll_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sll_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_sll_epi32(__A, __B), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sll_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_sll_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sll_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_sll_epi32(__A, __B), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_slli_epi32(__A, (int)__B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_slli_epi32(__A, (int)__B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_slli_epi32(__A, (int)__B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_slli_epi32(__A, (int)__B), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__
__m256i __DEFAULT_FN_ATTRS256 _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_slli_epi32(__A, (int)__B), - (__v8si)__W); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_slli_epi32(__A, (int)__B), (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_slli_epi32(__A, (int)__B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_slli_epi32(__A, (int)__B), (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sll_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_sll_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sll_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_sll_epi64(__A, __B), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sll_epi64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_sll_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sll_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v4di)_mm256_sll_epi64(__A, __B), + (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_slli_epi64(__A, (int)__B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_slli_epi64(__A, (int)__B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_slli_epi64(__A, (int)__B), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_slli_epi64(__A, (int)__B), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_slli_epi64(__A, (int)__B), - (__v4di)__W); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_slli_epi64(__A, (int)__B), (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 
_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_slli_epi64(__A, (int)__B), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_slli_epi64(__A, (int)__B), (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -4632,17 +4474,17 @@ _mm_rorv_epi32 (__m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_rorv_epi32(__A, __B), - (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_rorv_epi32(__A, __B), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_rorv_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_rorv_epi32(__A, __B), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -4654,17 +4496,17 @@ _mm256_rorv_epi32 (__m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_rorv_epi32(__A, __B), - (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_rorv_epi32(__A, __B), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_rorv_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector((__v8si)_mm256_rorv_epi32(__A, __B), + (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -4676,17 +4518,17 @@ _mm_rorv_epi64 (__m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_rorv_epi64(__A, __B), - (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_rorv_epi64(__A, __B), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_rorv_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_rorv_epi64(__A, __B), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -4698,305 +4540,305 @@ _mm256_rorv_epi64 (__m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_rorv_epi64(__A, __B), - (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_rorv_epi64(__A, __B), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { - 
return (__m256i)__builtin_ia32_selectq_256(__U,
-                                             (__v4di)_mm256_rorv_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_rorv_epi64(__A, __B),
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_sllv_epi64(__X, __Y),
-                                             (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_sllv_epi64(__X, __Y),
+                                         (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_sllv_epi64(__X, __Y),
-                                             (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_sllv_epi64(__X, __Y),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_sllv_epi64(__X, __Y),
-                                             (__v4di)__W);
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_sllv_epi64(__X, __Y),
+                                         (__v4di)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_sllv_epi64(__X, __Y),
-                                             (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_sllv_epi64(__X, __Y),
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sllv_epi32(__X, __Y),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_sllv_epi32(__X, __Y),
+                                         (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sllv_epi32(__X, __Y),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_sllv_epi32(__X, __Y),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sllv_epi32(__X, __Y),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_sllv_epi32(__X, __Y),
+                                         (__v8si)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sllv_epi32(__X, __Y),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_sllv_epi32(__X, __Y),
+                                         (__v8si)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srlv_epi64(__X, __Y),
-                                             (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srlv_epi64(__X, __Y),
+                                         (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srlv_epi64(__X, __Y),
-                                             (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srlv_epi64(__X, __Y),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srlv_epi64(__X, __Y),
-                                             (__v4di)__W);
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_srlv_epi64(__X, __Y),
+                                         (__v4di)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srlv_epi64(__X, __Y),
-                                             (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_srlv_epi64(__X, __Y),
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srlv_epi32(__X, __Y),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srlv_epi32(__X, __Y),
+                                         (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srlv_epi32(__X, __Y),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srlv_epi32(__X, __Y),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srlv_epi32(__X, __Y),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_srlv_epi32(__X, __Y),
+                                         (__v8si)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srlv_epi32(__X, __Y),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_srlv_epi32(__X, __Y),
+                                         (__v8si)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srl_epi32(__A, __B),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srl_epi32(__A, __B),
+                                         (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srl_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srl_epi32(__A, __B),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srl_epi32(__A, __B),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_srl_epi32(__A, __B),
+                                         (__v8si)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srl_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_srl_epi32(__A, __B),
+                                         (__v8si)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srli_epi32(__A, (int)__B),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srli_epi32(__A, (int)__B),
+                                         (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srli_epi32(__A, (int)__B),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srli_epi32(__A, (int)__B),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A,
                        unsigned int __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srli_epi32(__A, (int)__B),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector(
+      (__v8si)_mm256_srli_epi32(__A, (int)__B), (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srli_epi32(__A, (int)__B),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
+      (__v8si)_mm256_srli_epi32(__A, (int)__B), (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srl_epi64(__A, __B),
-                                             (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srl_epi64(__A, __B),
+                                         (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srl_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srl_epi64(__A, __B),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srl_epi64(__A, __B),
-                                             (__v4di)__W);
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_srl_epi64(__A, __B),
+                                         (__v4di)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srl_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_srl_epi64(__A, __B),
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srli_epi64(__A, (int)__B),
-                                             (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srli_epi64(__A, (int)__B),
+                                         (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srli_epi64(__A, (int)__B),
-                                             (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srli_epi64(__A, (int)__B),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A,
                        unsigned int __B)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srli_epi64(__A, (int)__B),
-                                             (__v4di)__W);
+  return (__m256i)__builtin_selectvector(
+      (__v4di)_mm256_srli_epi64(__A, (int)__B), (__v4di)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srli_epi64(__A, (int)__B),
-                                             (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
+      (__v4di)_mm256_srli_epi64(__A, (int)__B), (__v4di)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srav_epi32(__X, __Y),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srav_epi32(__X, __Y),
+                                         (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srav_epi32(__X, __Y),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srav_epi32(__X, __Y),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srav_epi32(__X, __Y),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_srav_epi32(__X, __Y),
+                                         (__v8si)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srav_epi32(__X, __Y),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_srav_epi32(__X, __Y),
+                                         (__v8si)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -5008,17 +4850,17 @@ _mm_srav_epi64(__m128i __X, __m128i __Y)
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srav_epi64(__X, __Y),
-                                             (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srav_epi64(__X, __Y),
+                                         (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_srav_epi64(__X, __Y),
-                                             (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srav_epi64(__X, __Y),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -5030,50 +4872,47 @@ _mm256_srav_epi64(__m256i __X, __m256i __Y)
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srav_epi64(__X, __Y),
-                                             (__v4di)__W);
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_srav_epi64(__X, __Y),
+                                         (__v4di)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_srav_epi64(__X, __Y),
-                                             (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_srav_epi64(__X, __Y),
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
-                                             (__v4si) __A,
-                                             (__v4si) __W);
+  return (__m128i)__builtin_selectvector((__v4si)__A, (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
-                                             (__v4si) __A,
-                                             (__v4si) _mm_setzero_si128 ());
+  return (__m128i)__builtin_selectvector((__v4si)__A,
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
-
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
-                                             (__v8si) __A,
-                                             (__v8si) __W);
+  return (__m256i)__builtin_selectvector((__v8si)__A, (__v8si)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
-                                             (__v8si) __A,
-                                             (__v8si) _mm256_setzero_si256 ());
+  return (__m256i)__builtin_selectvector((__v8si)__A,
+                                         (__v8si)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline __m128i __DEFAULT_FN_ATTRS128
@@ -5157,33 +4996,31 @@ _mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
                                             (__v2di) __A,
-                                             (__v2di) __W);
+  return (__m128i)__builtin_selectvector((__v2di)__A, (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
-                                             (__v2di) __A,
-                                             (__v2di) _mm_setzero_si128 ());
+  return (__m128i)__builtin_selectvector((__v2di)__A,
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
-                                             (__v4di) __A,
-                                             (__v4di) __W);
+  return (__m256i)__builtin_selectvector((__v4di)__A, (__v4di)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
-                                             (__v4di) __A,
-                                             (__v4di) _mm256_setzero_si256 ());
+  return (__m256i)__builtin_selectvector((__v4di)__A,
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline __m128i __DEFAULT_FN_ATTRS128
@@ -5267,98 +5104,97 @@ _mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
 {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                             (__v2df)_mm_movedup_pd(__A),
-                                             (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_movedup_pd(__A),
+                                         (__v2df)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
 {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                             (__v2df)_mm_movedup_pd(__A),
-                                             (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_movedup_pd(__A),
+                                         (__v2df)_mm_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_movedup_pd(__A),
-                                             (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_movedup_pd(__A),
+                                         (__v4df)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_movedup_pd(__A),
-                                             (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_movedup_pd(__A),
+                                         (__v4df)_mm256_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A)
 {
-  return (__m128i)__builtin_ia32_selectd_128(__M,
-                                             (__v4si) _mm_set1_epi32(__A),
-                                             (__v4si)__O);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_set1_epi32(__A),
+                                         (__v4si)__O,
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_set1_epi32( __mmask8 __M, int __A)
 {
-  return (__m128i)__builtin_ia32_selectd_128(__M,
-                                             (__v4si) _mm_set1_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_set1_epi32(__A),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A)
 {
-  return (__m256i)__builtin_ia32_selectd_256(__M,
-                                             (__v8si) _mm256_set1_epi32(__A),
-                                             (__v8si)__O);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_set1_epi32(__A),
+                                         (__v8si)__O,
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_set1_epi32( __mmask8 __M, int __A)
 {
-  return (__m256i)__builtin_ia32_selectd_256(__M,
-                                             (__v8si) _mm256_set1_epi32(__A),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_set1_epi32(__A),
+                                         (__v8si)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
-
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
 {
-  return (__m128i) __builtin_ia32_selectq_128(__M,
-                                             (__v2di) _mm_set1_epi64x(__A),
-                                             (__v2di) __O);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_set1_epi64x(__A),
+                                         (__v2di)__O,
+                                         __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
-  return (__m128i) __builtin_ia32_selectq_128(__M,
-                                             (__v2di) _mm_set1_epi64x(__A),
-                                             (__v2di) _mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_set1_epi64x(__A),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
 {
-  return (__m256i) __builtin_ia32_selectq_256(__M,
-                                             (__v4di) _mm256_set1_epi64x(__A),
-                                             (__v4di) __O) ;
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_set1_epi64x(__A),
+                                         (__v4di)__O,
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
-  return (__m256i) __builtin_ia32_selectq_256(__M,
-                                             (__v4di) _mm256_set1_epi64x(__A),
-                                             (__v4di) _mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_set1_epi64x(__A),
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 #define _mm_fixupimm_pd(A, B, C, imm) \
@@ -5809,129 +5645,129 @@ _mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                             (__v2df)_mm_unpackhi_pd(__A, __B),
-                                             (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_unpackhi_pd(__A, __B),
+                                         (__v2df)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                             (__v2df)_mm_unpackhi_pd(__A, __B),
-                                             (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_unpackhi_pd(__A, __B),
+                                         (__v2df)_mm_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_unpackhi_pd(__A, __B),
-                                             (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_unpackhi_pd(__A, __B),
+                                         (__v4df)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_unpackhi_pd(__A, __B),
-                                             (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_unpackhi_pd(__A, __B),
+                                         (__v4df)_mm256_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
-                                             (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_unpackhi_ps(__A, __B),
+                                        (__v4sf)__W,
+                                        __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_unpackhi_ps(__A, __B),
+                                        (__v4sf)_mm_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_unpackhi_ps(__A, __B),
-                                             (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_unpackhi_ps(__A, __B),
+                                        (__v8sf)__W,
+                                        __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_unpackhi_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_unpackhi_ps(__A, __B),
+                                        (__v8sf)_mm256_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                             (__v2df)_mm_unpacklo_pd(__A, __B),
-                                             (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_unpacklo_pd(__A, __B),
+                                         (__v2df)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                             (__v2df)_mm_unpacklo_pd(__A, __B),
-                                             (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_unpacklo_pd(__A, __B),
+                                         (__v2df)_mm_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_unpacklo_pd(__A, __B),
-                                             (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_unpacklo_pd(__A, __B),
+                                         (__v4df)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_unpacklo_pd(__A, __B),
-                                             (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_unpacklo_pd(__A, __B),
+                                         (__v4df)_mm256_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
-                                             (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_unpacklo_ps(__A, __B),
+                                        (__v4sf)__W,
+                                        __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B)
 {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
-                                             (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_unpacklo_ps(__A, __B),
+                                        (__v4sf)_mm_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_unpacklo_ps(__A, __B),
-                                             (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_unpacklo_ps(__A, __B),
+                                        (__v8sf)__W,
+                                        __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_unpacklo_ps(__A, __B),
-                                             (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_unpacklo_ps(__A, __B),
+                                        (__v8sf)_mm256_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
@@ -6038,108 +5874,108 @@ _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
                                                (__mmask8) __U);
 }
 
-#define _mm_mask_permute_pd(W, U, X, C) \
-  ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
-                                       (__v2df)_mm_permute_pd((X), (C)), \
-                                       (__v2df)(__m128d)(W)))
+#define _mm_mask_permute_pd(W, U, X, C) \
+  ((__m128d)__builtin_selectvector((__v2df)_mm_permute_pd((X), (C)), \
+                                   (__v2df)(__m128d)(W), \
+                                   __builtin_bit_cast(__vecmask2, (U))))
 
-#define _mm_maskz_permute_pd(U, X, C) \
-  ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
-                                       (__v2df)_mm_permute_pd((X), (C)), \
-                                       (__v2df)_mm_setzero_pd()))
+#define _mm_maskz_permute_pd(U, X, C) \
+  ((__m128d)__builtin_selectvector((__v2df)_mm_permute_pd((X), (C)), \
+                                   (__v2df)_mm_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask2, (U))))
 
-#define _mm256_mask_permute_pd(W, U, X, C) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_permute_pd((X), (C)), \
-                                       (__v4df)(__m256d)(W)))
+#define _mm256_mask_permute_pd(W, U, X, C) \
+  ((__m256d)__builtin_selectvector((__v4df)_mm256_permute_pd((X), (C)), \
+                                   (__v4df)(__m256d)(W), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
 
-#define _mm256_maskz_permute_pd(U, X, C) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_permute_pd((X), (C)), \
-                                       (__v4df)_mm256_setzero_pd()))
+#define _mm256_maskz_permute_pd(U, X, C) \
+  ((__m256d)__builtin_selectvector((__v4df)_mm256_permute_pd((X), (C)), \
+                                   (__v4df)_mm256_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
 
-#define _mm_mask_permute_ps(W, U, X, C) \
-  ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
-                                      (__v4sf)_mm_permute_ps((X), (C)), \
-                                      (__v4sf)(__m128)(W)))
+#define _mm_mask_permute_ps(W, U, X, C) \
+  ((__m128)__builtin_selectvector((__v4sf)_mm_permute_ps((X), (C)), \
+                                  (__v4sf)(__m128)(W), \
+                                  __builtin_bit_cast(__vecmask4, (U))))
 
-#define _mm_maskz_permute_ps(U, X, C) \
-  ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
-                                      (__v4sf)_mm_permute_ps((X), (C)), \
-                                      (__v4sf)_mm_setzero_ps()))
+#define _mm_maskz_permute_ps(U, X, C) \
+  ((__m128)__builtin_selectvector((__v4sf)_mm_permute_ps((X), (C)), \
+                                  (__v4sf)_mm_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask4, (U))))
 
-#define _mm256_mask_permute_ps(W, U, X, C) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                      (__v8sf)_mm256_permute_ps((X), (C)), \
-                                      (__v8sf)(__m256)(W)))
+#define _mm256_mask_permute_ps(W, U, X, C) \
+  ((__m256)__builtin_selectvector((__v8sf)_mm256_permute_ps((X), (C)), \
+                                  (__v8sf)(__m256)(W), \
+                                  __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm256_maskz_permute_ps(U, X, C) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                      (__v8sf)_mm256_permute_ps((X), (C)), \
-                                      (__v8sf)_mm256_setzero_ps()))
+#define _mm256_maskz_permute_ps(U, X, C) \
+  ((__m256)__builtin_selectvector((__v8sf)_mm256_permute_ps((X), (C)), \
+                                  (__v8sf)_mm256_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask8, (U))))
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C)
 {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                             (__v2df)_mm_permutevar_pd(__A, __C),
-                                             (__v2df)__W);
+  return (__m128d)__builtin_selectvector((__v2df)_mm_permutevar_pd(__A, __C),
+                                         (__v2df)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C)
 {
-  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
-                                             (__v2df)_mm_permutevar_pd(__A, __C),
-                                             (__v2df)_mm_setzero_pd());
+  return (__m128d)__builtin_selectvector((__v2df)_mm_permutevar_pd(__A, __C),
+                                         (__v2df)_mm_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask2, __U));
}
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_permutevar_pd(__A, __C),
-                                             (__v4df)__W);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_permutevar_pd(__A, __C),
+                                         (__v4df)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_permutevar_pd(__A, __C),
-                                             (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_permutevar_pd(__A, __C),
+                                         (__v4df)_mm256_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
 {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_permutevar_ps(__A, __C),
-                                             (__v4sf)__W);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_permutevar_ps(__A, __C),
+                                        (__v4sf)__W,
+                                        __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
 {
-  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
-                                             (__v4sf)_mm_permutevar_ps(__A, __C),
-                                             (__v4sf)_mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_permutevar_ps(__A, __C),
+                                        (__v4sf)_mm_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_permutevar_ps(__A, __C),
-                                             (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_permutevar_ps(__A, __C),
+                                        (__v8sf)__W,
+                                        __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_permutevar_ps(__A, __C),
-                                             (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_permutevar_ps(__A, __C),
+                                        (__v8sf)_mm256_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
@@ -6253,193 +6089,193 @@ _mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_unpackhi_epi32(__A, __B),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_unpackhi_epi32(__A, __B),
+                                         (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_unpackhi_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_unpackhi_epi32(__A, __B),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_unpackhi_epi32(__A, __B),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector(
+      (__v8si)_mm256_unpackhi_epi32(__A, __B), (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_unpackhi_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
+      (__v8si)_mm256_unpackhi_epi32(__A, __B), (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_unpackhi_epi64(__A, __B),
-                                             (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_unpackhi_epi64(__A, __B),
+                                         (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_unpackhi_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_unpackhi_epi64(__A, __B),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_unpackhi_epi64(__A, __B),
-                                             (__v4di)__W);
+  return (__m256i)__builtin_selectvector(
+      (__v4di)_mm256_unpackhi_epi64(__A, __B), (__v4di)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_unpackhi_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
      (__v4di)_mm256_unpackhi_epi64(__A, __B), (__v4di)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_unpacklo_epi32(__A, __B),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_unpacklo_epi32(__A, __B),
+                                         (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_unpacklo_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_unpacklo_epi32(__A, __B),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_unpacklo_epi32(__A, __B),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector(
+      (__v8si)_mm256_unpacklo_epi32(__A, __B), (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_unpacklo_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
      (__v8si)_mm256_unpacklo_epi32(__A, __B), (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_unpacklo_epi64(__A, __B),
-                                             (__v2di)__W);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_unpacklo_epi64(__A, __B),
+                                         (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
-                                             (__v2di)_mm_unpacklo_epi64(__A, __B),
-                                             (__v2di)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_unpacklo_epi64(__A, __B),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_unpacklo_epi64(__A, __B),
-                                             (__v4di)__W);
+  return (__m256i)__builtin_selectvector(
      (__v4di)_mm256_unpacklo_epi64(__A, __B), (__v4di)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
-                                             (__v4di)_mm256_unpacklo_epi64(__A, __B),
-                                             (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
      (__v4di)_mm256_unpacklo_epi64(__A, __B), (__v4di)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sra_epi32(__A, __B),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_sra_epi32(__A, __B),
+                                         (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_sra_epi32(__A, __B),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_sra_epi32(__A, __B),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sra_epi32(__A, __B),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_sra_epi32(__A, __B),
+                                         (__v8si)__W,
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_sra_epi32(__A, __B),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_sra_epi32(__A, __B),
+                                         (__v8si)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srai_epi32(__A, (int)__B),
-                                             (__v4si)__W);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srai_epi32(__A, (int)__B),
+                                         (__v4si)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
-                                             (__v4si)_mm_srai_epi32(__A, (int)__B),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_srai_epi32(__A, (int)__B),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A,
                        unsigned int __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srai_epi32(__A, (int)__B),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector(
      (__v8si)_mm256_srai_epi32(__A, (int)__B), (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
-                                             (__v8si)_mm256_srai_epi32(__A, (int)__B),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
      (__v8si)_mm256_srai_epi32(__A, (int)__B), (__v8si)_mm256_setzero_si256(),
+      __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -6449,19 +6285,17 @@ _mm_sra_epi64(__m128i __A, __m128i __B)
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
-                                             (__v2di)_mm_sra_epi64(__A, __B), \
-                                             (__v2di)__W);
+_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_selectvector((__v2di)_mm_sra_epi64(__A, __B),
+                                         (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
-                                             (__v2di)_mm_sra_epi64(__A, __B), \
-                                             (__v2di)_mm_setzero_si128());
+_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_selectvector((__v2di)_mm_sra_epi64(__A, __B),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -6471,19 +6305,17 @@ _mm256_sra_epi64(__m256i __A, __m128i __B)
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
-                                             (__v4di)_mm256_sra_epi64(__A, __B), \
-                                             (__v4di)__W);
+_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_sra_epi64(__A, __B),
+                                         (__v4di)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
-                                             (__v4di)_mm256_sra_epi64(__A, __B), \
-                                             (__v4di)_mm256_setzero_si256());
+_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_sra_epi64(__A, __B),
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -6492,20 +6324,18 @@ _mm_srai_epi64(__m128i __A, unsigned int __imm)
   return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, (int)__imm);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
-                                             (__v2di)_mm_srai_epi64(__A, __imm), \
-                                             (__v2di)__W);
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srai_epi64(
+    __m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm) {
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srai_epi64(__A, __imm),
+                                         (__v2di)__W,
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm)
-{
-  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
-                                             (__v2di)_mm_srai_epi64(__A, __imm), \
-                                             (__v2di)_mm_setzero_si128());
+_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm) {
+  return (__m128i)__builtin_selectvector((__v2di)_mm_srai_epi64(__A, __imm),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -6514,21 +6344,18 @@ _mm256_srai_epi64(__m256i __A, unsigned int __imm)
   return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, (int)__imm);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A,
-                       unsigned int __imm)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
-                                             (__v4di)_mm256_srai_epi64(__A, __imm), \
-                                             (__v4di)__W);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srai_epi64(
+    __m256i __W, __mmask8 __U, __m256i __A, unsigned int __imm) {
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_srai_epi64(__A, __imm),
+                                         (__v4di)__W,
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
-{
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
-                                             (__v4di)_mm256_srai_epi64(__A, __imm), \
-                                             (__v4di)_mm256_setzero_si256());
+_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm) {
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_srai_epi64(__A, __imm),
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __U));
 }
 
 #define _mm_ternarylogic_epi32(A, B, C, imm) \
@@ -6595,98 +6422,97 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
   ((__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \
                                          (__v8sf)(__m256)(B), (int)(imm)))
 
-#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                      (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
-                                      (__v8sf)(__m256)(W)))
+#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \
+  ((__m256)__builtin_selectvector( \
+      (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), (__v8sf)(__m256)(W), \
+      __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                      (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
-                                      (__v8sf)_mm256_setzero_ps()))
+#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \
+  ((__m256)__builtin_selectvector( \
+      (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
+      (__v8sf)_mm256_setzero_ps(), __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm256_shuffle_f64x2(A, B, imm) \
   ((__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \
                                           (__v4df)(__m256d)(B), (int)(imm)))
 
-#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
-                                       (__v4df)(__m256d)(W)))
+#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \
+  ((__m256d)__builtin_selectvector( \
+      (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), (__v4df)(__m256d)(W), \
+      __builtin_bit_cast(__vecmask4, (U))))
 
-#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
-                                       (__v4df)_mm256_setzero_pd()))
+#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \
+  ((__m256d)__builtin_selectvector( \
+      (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
+      (__v4df)_mm256_setzero_pd(), __builtin_bit_cast(__vecmask4, (U))))
 
 #define _mm256_shuffle_i32x4(A, B, imm) \
   ((__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \
                                           (__v8si)(__m256i)(B), (int)(imm)))
 
-#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                      (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
-                                      (__v8si)(__m256i)(W)))
+#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \
+  ((__m256i)__builtin_selectvector( \
      (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), (__v8si)(__m256i)(W), \
+      __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                      (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
-                                      (__v8si)_mm256_setzero_si256()))
+#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \
+  ((__m256i)__builtin_selectvector( \
      (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
+      (__v8si)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm256_shuffle_i64x2(A, B, imm) \
   ((__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \
                                           (__v4di)(__m256i)(B), (int)(imm)))
 
-#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                      (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
-                                      (__v4di)(__m256i)(W)))
-
-
-#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                      (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
-                                      (__v4di)_mm256_setzero_si256()))
-
-#define _mm_mask_shuffle_pd(W, U, A, B, M) \
-  ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
-                                       (__v2df)_mm_shuffle_pd((A), (B), (M)), \
-                                       (__v2df)(__m128d)(W)))
-
-#define _mm_maskz_shuffle_pd(U, A, B, M) \
-  ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
-                                       (__v2df)_mm_shuffle_pd((A), (B), (M)), \
-                                       (__v2df)_mm_setzero_pd()))
-
-#define _mm256_mask_shuffle_pd(W, U, A, B, M) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
-                                       (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_shuffle_pd(U, A, B, M) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
-                                       (__v4df)_mm256_setzero_pd()))
-
-#define _mm_mask_shuffle_ps(W, U, A, B, M) \
-  ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
-                                      (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
-                                      (__v4sf)(__m128)(W)))
-
-#define _mm_maskz_shuffle_ps(U, A, B, M) \
-  ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
-                                      (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
-                                      (__v4sf)_mm_setzero_ps()))
-
-#define _mm256_mask_shuffle_ps(W, U, A, B, M) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                      (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
-                                      (__v8sf)(__m256)(W)))
-
-#define _mm256_maskz_shuffle_ps(U, A, B, M) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                      (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
-                                      (__v8sf)_mm256_setzero_ps()))
+#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \
+  ((__m256i)__builtin_selectvector( \
      (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), (__v4di)(__m256i)(W), \
+      __builtin_bit_cast(__vecmask4, (U))))
+
+#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \
+  ((__m256i)__builtin_selectvector( \
      (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
+      (__v4di)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask4, (U))))
+
+#define _mm_mask_shuffle_pd(W, U, A, B, M) \
+  ((__m128d)__builtin_selectvector((__v2df)_mm_shuffle_pd((A), (B), (M)), \
+                                   (__v2df)(__m128d)(W), \
+                                   __builtin_bit_cast(__vecmask2, (U))))
+
+#define _mm_maskz_shuffle_pd(U, A, B, M) \
+  ((__m128d)__builtin_selectvector((__v2df)_mm_shuffle_pd((A), (B), (M)), \
+                                   (__v2df)_mm_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask2, (U))))
+
+#define _mm256_mask_shuffle_pd(W, U, A, B, M) \
+  ((__m256d)__builtin_selectvector((__v4df)_mm256_shuffle_pd((A), (B), (M)), \
+                                   (__v4df)(__m256d)(W), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
+
+#define _mm256_maskz_shuffle_pd(U, A, B, M) \
+  ((__m256d)__builtin_selectvector((__v4df)_mm256_shuffle_pd((A), (B), (M)), \
+                                   (__v4df)_mm256_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
+
+#define _mm_mask_shuffle_ps(W, U, A, B, M) \
+  ((__m128)__builtin_selectvector((__v4sf)_mm_shuffle_ps((A), (B), (M)), \
+                                  (__v4sf)(__m128)(W), \
+                                  __builtin_bit_cast(__vecmask4, (U))))
+
+#define _mm_maskz_shuffle_ps(U, A, B, M) \
+  ((__m128)__builtin_selectvector((__v4sf)_mm_shuffle_ps((A), (B), (M)), \
+                                  (__v4sf)_mm_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask4, (U))))
+
+#define _mm256_mask_shuffle_ps(W, U, A, B, M) \
+  ((__m256)__builtin_selectvector((__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
+                                  (__v8sf)(__m256)(W), \
+                                  __builtin_bit_cast(__vecmask8, (U))))
+
+#define _mm256_maskz_shuffle_ps(U, A, B, M) \
+  ((__m256)__builtin_selectvector((__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
+                                  (__v8sf)_mm256_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask8, (U))))
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_rsqrt14_pd (__m128d __A)
@@ -6802,17 +6628,17 @@ _mm256_broadcast_f32x4(__m128 __A)
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
-                                             (__v8sf)_mm256_broadcast_f32x4(__A),
-                                             (__v8sf)__O);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_broadcast_f32x4(__A),
+                                        (__v8sf)__O,
+                                        __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
-                                             (__v8sf)_mm256_broadcast_f32x4(__A),
-                                             (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_broadcast_f32x4(__A),
+                                        (__v8sf)_mm256_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -6825,129 +6651,126 @@ _mm256_broadcast_i32x4(__m128i __A)
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_broadcast_i32x4(__A),
-                                             (__v8si)__O);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_broadcast_i32x4(__A),
+                                         (__v8si)__O,
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_broadcast_i32x4(__A),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_broadcast_i32x4(__A),
+                                         (__v8si)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
 {
-  return (__m256d)__builtin_ia32_selectpd_256(__M,
-                                             (__v4df) _mm256_broadcastsd_pd(__A),
-                                             (__v4df) __O);
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_broadcastsd_pd(__A),
+                                         (__v4df)__O,
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
 {
-  return (__m256d)__builtin_ia32_selectpd_256(__M,
-                                             (__v4df) _mm256_broadcastsd_pd(__A),
-                                             (__v4df) _mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector((__v4df)_mm256_broadcastsd_pd(__A),
+                                         (__v4df)_mm256_setzero_pd(),
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
 {
-  return (__m128)__builtin_ia32_selectps_128(__M,
-                                             (__v4sf) _mm_broadcastss_ps(__A),
-                                             (__v4sf) __O);
+  return (__m128)__builtin_selectvector((__v4sf)_mm_broadcastss_ps(__A),
+                                        (__v4sf)__O,
+                                        __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
 {
-  return (__m128)__builtin_ia32_selectps_128(__M,
-                                             (__v4sf) _mm_broadcastss_ps(__A),
-                                             (__v4sf) _mm_setzero_ps());
+  return (__m128)__builtin_selectvector((__v4sf)_mm_broadcastss_ps(__A),
+                                        (__v4sf)_mm_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256(__M,
-                                             (__v8sf) _mm256_broadcastss_ps(__A),
-                                             (__v8sf) __O);
+_mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A) {
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_broadcastss_ps(__A),
+                                        (__v8sf)__O,
+                                        __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
-{
-  return (__m256)__builtin_ia32_selectps_256(__M,
-                                             (__v8sf) _mm256_broadcastss_ps(__A),
-                                             (__v8sf) _mm256_setzero_ps());
+_mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) {
  return (__m256)__builtin_selectvector((__v8sf)_mm256_broadcastss_ps(__A),
+                                        (__v8sf)_mm256_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m128i)__builtin_ia32_selectd_128(__M,
-                                             (__v4si) _mm_broadcastd_epi32(__A),
-                                             (__v4si) __O);
+  return (__m128i)__builtin_selectvector((__v4si)_mm_broadcastd_epi32(__A),
+                                         (__v4si)__O,
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
 {
-  return (__m128i)__builtin_ia32_selectd_128(__M,
-                                             (__v4si) _mm_broadcastd_epi32(__A),
-                                             (__v4si) _mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm_broadcastd_epi32(__A),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m256i)__builtin_ia32_selectd_256(__M,
-                                             (__v8si) _mm256_broadcastd_epi32(__A),
-                                             (__v8si) __O);
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_broadcastd_epi32(__A),
+                                         (__v8si)__O,
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
 {
-  return (__m256i)__builtin_ia32_selectd_256(__M,
-                                             (__v8si) _mm256_broadcastd_epi32(__A),
-                                             (__v8si) _mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v8si)_mm256_broadcastd_epi32(__A),
+                                         (__v8si)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                             (__v2di) _mm_broadcastq_epi64(__A),
-                                             (__v2di) __O);
+  return (__m128i)__builtin_selectvector((__v2di)_mm_broadcastq_epi64(__A),
+                                         (__v2di)__O,
+                                         __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
 {
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                             (__v2di) _mm_broadcastq_epi64(__A),
-                                             (__v2di) _mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v2di)_mm_broadcastq_epi64(__A),
+                                         (__v2di)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask2, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                             (__v4di) _mm256_broadcastq_epi64(__A),
-                                             (__v4di) __O);
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_broadcastq_epi64(__A),
+                                         (__v4di)__O,
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
 {
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                             (__v4di) _mm256_broadcastq_epi64(__A),
-                                             (__v4di) _mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector((__v4di)_mm256_broadcastq_epi64(__A),
+                                         (__v4di)_mm256_setzero_si256(),
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
@@ -7747,17 +7570,17 @@ _mm256_cvtepi64_epi32 (__m256i __A)
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm256_cvtepi64_epi32(__A),
-                                             (__v4si)__O);
+  return (__m128i)__builtin_selectvector((__v4si)_mm256_cvtepi64_epi32(__A),
+                                         (__v4si)__O,
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
 {
-  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
-                                             (__v4si)_mm256_cvtepi64_epi32(__A),
-                                             (__v4si)_mm_setzero_si128());
+  return (__m128i)__builtin_selectvector((__v4si)_mm256_cvtepi64_epi32(__A),
+                                         (__v4si)_mm_setzero_si128(),
+                                         __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS256
@@ -7865,29 +7688,29 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
   ((__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \
                                           (__v4sf)(__m128)(B), (int)(imm)))
 
-#define _mm256_mask_insertf32x4(W, U, A, B, imm) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                      (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
-                                      (__v8sf)(__m256)(W)))
+#define _mm256_mask_insertf32x4(W, U, A, B, imm) \
+  ((__m256)__builtin_selectvector((__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+                                  (__v8sf)(__m256)(W), \
+                                  __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm256_maskz_insertf32x4(U, A, B, imm) \
-  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
-                                      (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
-                                      (__v8sf)_mm256_setzero_ps()))
+#define _mm256_maskz_insertf32x4(U, A, B, imm) \
+  ((__m256)__builtin_selectvector((__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+                                  (__v8sf)_mm256_setzero_ps(), \
+                                  __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm256_inserti32x4(A, B, imm) \
  ((__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \
                                            (__v4si)(__m128i)(B), (int)(imm)))
 
-#define _mm256_mask_inserti32x4(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                      (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
-                                      (__v8si)(__m256i)(W)))
+#define _mm256_mask_inserti32x4(W, U, A, B, imm) \
+  ((__m256i)__builtin_selectvector( \
      (__v8si)_mm256_inserti32x4((A), (B), (imm)), (__v8si)(__m256i)(W), \
+      __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm256_maskz_inserti32x4(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                      (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
-                                      (__v8si)_mm256_setzero_si256()))
+#define _mm256_maskz_inserti32x4(U, A, B, imm) \
+  ((__m256i)__builtin_selectvector( \
      (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+      (__v8si)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm_getmant_pd(A, B, C) \
   ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
@@ -8060,28 +7883,28 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
 #define _mm256_permutex_pd(X, C) \
   ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C)))
 
-#define _mm256_mask_permutex_pd(W, U, X, C) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_permutex_pd((X), (C)), \
-                                       (__v4df)(__m256d)(W)))
+#define _mm256_mask_permutex_pd(W, U, X, C) \
+  ((__m256d)__builtin_selectvector((__v4df)_mm256_permutex_pd((X), (C)), \
+                                   (__v4df)(__m256d)(W), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
 
-#define _mm256_maskz_permutex_pd(U, X, C) \
-  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
-                                       (__v4df)_mm256_permutex_pd((X), (C)), \
-                                       (__v4df)_mm256_setzero_pd()))
+#define _mm256_maskz_permutex_pd(U, X, C) \
+  ((__m256d)__builtin_selectvector((__v4df)_mm256_permutex_pd((X), (C)), \
+                                   (__v4df)_mm256_setzero_pd(), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
 
 #define _mm256_permutex_epi64(X, C) \
   ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C)))
 
-#define _mm256_mask_permutex_epi64(W, U, X, C) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                      (__v4di)_mm256_permutex_epi64((X), (C)), \
-                                      (__v4di)(__m256i)(W)))
+#define _mm256_mask_permutex_epi64(W, U, X, C) \
+  ((__m256i)__builtin_selectvector((__v4di)_mm256_permutex_epi64((X), (C)), \
+                                   (__v4di)(__m256i)(W), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
 
-#define _mm256_maskz_permutex_epi64(U, X, C) \
-  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
-                                      (__v4di)_mm256_permutex_epi64((X), (C)), \
-                                      (__v4di)_mm256_setzero_si256()))
+#define _mm256_maskz_permutex_epi64(U, X, C) \
+  ((__m256i)__builtin_selectvector((__v4di)_mm256_permutex_epi64((X), (C)), \
+                                   (__v4di)_mm256_setzero_si256(), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_permutexvar_pd (__m256i __X, __m256d __Y)
@@ -8093,17 +7916,17 @@ static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
                             __m256d __Y)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_permutexvar_pd(__X, __Y),
-                                             (__v4df)__W);
+  return (__m256d)__builtin_selectvector(
      (__v4df)_mm256_permutexvar_pd(__X, __Y), (__v4df)__W,
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
 {
-  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
-                                             (__v4df)_mm256_permutexvar_pd(__X, __Y),
-                                             (__v4df)_mm256_setzero_pd());
+  return (__m256d)__builtin_selectvector(
      (__v4df)_mm256_permutexvar_pd(__X, __Y), (__v4df)_mm256_setzero_pd(),
+      __builtin_bit_cast(__vecmask4, __U));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -8115,18 +7938,18 @@ _mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y)
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_permutexvar_epi64(__X, __Y),
-                                             (__v4di)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
      (__v4di)_mm256_permutexvar_epi64(__X, __Y),
+      (__v4di)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask4, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
                                __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
-                                             (__v4di)_mm256_permutexvar_epi64(__X, __Y),
-                                             (__v4di)__W);
+  return (__m256i)__builtin_selectvector(
      (__v4di)_mm256_permutexvar_epi64(__X, __Y), (__v4di)__W,
+      __builtin_bit_cast(__vecmask4, __M));
 }
 
 #define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A))
@@ -8134,17 +7957,17 @@ _mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_permutexvar_ps(__X, __Y),
-                                             (__v8sf)__W);
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_permutexvar_ps(__X, __Y),
+                                        (__v8sf)__W,
+                                        __builtin_bit_cast(__vecmask8, __U));
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y)
 {
-  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
-                                             (__v8sf)_mm256_permutexvar_ps(__X, __Y),
-                                             (__v8sf)_mm256_setzero_ps());
+  return (__m256)__builtin_selectvector((__v8sf)_mm256_permutexvar_ps(__X, __Y),
+                                        (__v8sf)_mm256_setzero_ps(),
+                                        __builtin_bit_cast(__vecmask8, __U));
 }
 
 #define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A))
@@ -8153,221 +7976,214 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X,
                               __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_permutexvar_epi32(__X, __Y),
-                                             (__v8si)__W);
+  return (__m256i)__builtin_selectvector(
      (__v8si)_mm256_permutexvar_epi32(__X, __Y), (__v8si)__W,
+      __builtin_bit_cast(__vecmask8, __M));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
-                                             (__v8si)_mm256_permutexvar_epi32(__X, __Y),
-                                             (__v8si)_mm256_setzero_si256());
+  return (__m256i)__builtin_selectvector(
      (__v8si)_mm256_permutexvar_epi32(__X, __Y),
+      (__v8si)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask8, __M));
 }
 
 #define _mm_alignr_epi32(A, B, imm) \
  ((__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \
                                     (__v4si)(__m128i)(B), (int)(imm)))
 
-#define _mm_mask_alignr_epi32(W, U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                      (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
-                                      (__v4si)(__m128i)(W)))
+#define _mm_mask_alignr_epi32(W, U, A, B, imm) \
+  ((__m128i)__builtin_selectvector((__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                   (__v4si)(__m128i)(W), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
 
-#define _mm_maskz_alignr_epi32(U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
-                                      (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
-                                      (__v4si)_mm_setzero_si128()))
+#define _mm_maskz_alignr_epi32(U, A, B, imm) \
+  ((__m128i)__builtin_selectvector((__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                   (__v4si)_mm_setzero_si128(), \
+                                   __builtin_bit_cast(__vecmask4, (U))))
 
 #define _mm256_alignr_epi32(A, B, imm) \
  ((__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \
                                     (__v8si)(__m256i)(B), (int)(imm)))
 
-#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                      (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
-                                      (__v8si)(__m256i)(W)))
+#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \
+  ((__m256i)__builtin_selectvector( \
      (__v8si)_mm256_alignr_epi32((A), (B), (imm)), (__v8si)(__m256i)(W), \
+      __builtin_bit_cast(__vecmask8, (U))))
 
-#define _mm256_maskz_alignr_epi32(U, A, B, imm) \
-  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
-                                      (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
-                                      (__v8si)_mm256_setzero_si256()))
+#define _mm256_maskz_alignr_epi32(U, A, B, imm) \
+  ((__m256i)__builtin_selectvector( \
      (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+      (__v8si)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask8, (U))))
 
 #define _mm_alignr_epi64(A, B, imm) \
  ((__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \
                                     (__v2di)(__m128i)(B), (int)(imm)))
 
-#define _mm_mask_alignr_epi64(W, U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
-                                      (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
-                                      (__v2di)(__m128i)(W)))
+#define _mm_mask_alignr_epi64(W, U, A, B, imm) \
+  ((__m128i)__builtin_selectvector((__v2di)_mm_alignr_epi64((A), (B), (imm)), \
                                   (__v2di)(__m128i)(W), \
+                                   __builtin_bit_cast(__vecmask2, (U))))
 
-#define _mm_maskz_alignr_epi64(U, A, B, imm) \
-  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
-                                      (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
-                                      (__v2di)_mm_setzero_si128()))
+#define _mm_maskz_alignr_epi64(U, A, B, imm) \
+  ((__m128i)__builtin_selectvector((__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+
(__v2di)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask2, (U)))) #define _mm256_alignr_epi64(A, B, imm) \ ((__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \ (__v4di)(__m256i)(B), (int)(imm))) -#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)(__m256i)(W))) +#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \ + ((__m256i)__builtin_selectvector( \ + (__v4di)_mm256_alignr_epi64((A), (B), (imm)), (__v4di)(__m256i)(W), \ + __builtin_bit_cast(__vecmask4, (U)))) -#define _mm256_maskz_alignr_epi64(U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256())) +#define _mm256_maskz_alignr_epi64(U, A, B, imm) \ + ((__m256i)__builtin_selectvector( \ + (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask4, (U)))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_movehdup_ps(__A), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm_movehdup_ps(__A), + (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_movehdup_ps(__A), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm_movehdup_ps(__A), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_movehdup_ps(__A), - (__v8sf)__W); + return (__m256)__builtin_selectvector((__v8sf)_mm256_movehdup_ps(__A), + (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_movehdup_ps(__A), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector((__v8sf)_mm256_movehdup_ps(__A), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_moveldup_ps(__A), - (__v4sf)__W); + return (__m128)__builtin_selectvector((__v4sf)_mm_moveldup_ps(__A), + (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_moveldup_ps(__A), - (__v4sf)_mm_setzero_ps()); + return (__m128)__builtin_selectvector((__v4sf)_mm_moveldup_ps(__A), + (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_moveldup_ps(__A), - (__v8sf)__W); + return (__m256)__builtin_selectvector((__v8sf)_mm256_moveldup_ps(__A), + (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_moveldup_ps 
(__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_moveldup_ps(__A), - (__v8sf)_mm256_setzero_ps()); + return (__m256)__builtin_selectvector((__v8sf)_mm256_moveldup_ps(__A), + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } -#define _mm256_mask_shuffle_epi32(W, U, A, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_epi32((A), (I)), \ - (__v8si)(__m256i)(W))) +#define _mm256_mask_shuffle_epi32(W, U, A, I) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_shuffle_epi32((A), (I)), \ + (__v8si)(__m256i)(W), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm256_maskz_shuffle_epi32(U, A, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_epi32((A), (I)), \ - (__v8si)_mm256_setzero_si256())) +#define _mm256_maskz_shuffle_epi32(U, A, I) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_shuffle_epi32((A), (I)), \ + (__v8si)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm_mask_shuffle_epi32(W, U, A, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)(__m128i)(W))) +#define _mm_mask_shuffle_epi32(W, U, A, I) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_shuffle_epi32((A), (I)), \ + (__v4si)(__m128i)(W), \ + __builtin_bit_cast(__vecmask4, (U)))) -#define _mm_maskz_shuffle_epi32(U, A, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)_mm_setzero_si128())) +#define _mm_maskz_shuffle_epi32(U, A, I) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_shuffle_epi32((A), (I)), \ + (__v4si)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask4, (U)))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, - (__v2df) __A, - (__v2df) __W); + return (__m128d)__builtin_selectvector((__v2df)__A, (__v2df)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_mov_pd (__mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, - (__v2df) __A, - (__v2df) _mm_setzero_pd ()); + return (__m128d)__builtin_selectvector((__v2df)__A, (__v2df)_mm_setzero_pd(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, - (__v4df) __A, - (__v4df) __W); + return (__m256d)__builtin_selectvector((__v4df)__A, (__v4df)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, - (__v4df) __A, - (__v4df) _mm256_setzero_pd ()); + return (__m256d)__builtin_selectvector((__v4df)__A, + (__v4df)_mm256_setzero_pd(), + __builtin_bit_cast(__vecmask4, __U)); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, - (__v4sf) __A, - (__v4sf) __W); +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_mov_ps(__m128 __W, + __mmask8 __U, + __m128 __A) { + return (__m128)__builtin_selectvector((__v4sf)__A, (__v4sf)__W, + __builtin_bit_cast(__vecmask4, __U)); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_ps 
(__mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, - (__v4sf) __A, - (__v4sf) _mm_setzero_ps ()); +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_mov_ps(__mmask8 __U, + __m128 __A) { + return (__m128)__builtin_selectvector((__v4sf)__A, (__v4sf)_mm_setzero_ps(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, - (__v8sf) __A, - (__v8sf) __W); + return (__m256)__builtin_selectvector((__v8sf)__A, (__v8sf)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, - (__v8sf) __A, - (__v8sf) _mm256_setzero_ps ()); + return (__m256)__builtin_selectvector((__v8sf)__A, + (__v8sf)_mm256_setzero_ps(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/avx512vlvbmi2intrin.h b/clang/lib/Headers/avx512vlvbmi2intrin.h index 77af2d5cbd2a0..8b43468e756f6 100644 --- a/clang/lib/Headers/avx512vlvbmi2intrin.h +++ b/clang/lib/Headers/avx512vlvbmi2intrin.h @@ -248,169 +248,169 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ (__v4di)(__m256i)(B), (int)(I))) -#define _mm256_mask_shldi_epi64(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ - (__v4di)(__m256i)(S))) +#define _mm256_mask_shldi_epi64(S, U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S), \ + __builtin_bit_cast(__vecmask4, (U)))) -#define _mm256_maskz_shldi_epi64(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ - (__v4di)_mm256_setzero_si256())) +#define _mm256_maskz_shldi_epi64(U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask4, (U)))) #define _mm_shldi_epi64(A, B, I) \ ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ (__v2di)(__m128i)(B), (int)(I))) -#define _mm_mask_shldi_epi64(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shldi_epi64((A), (B), (I)), \ - (__v2di)(__m128i)(S))) +#define _mm_mask_shldi_epi64(S, U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S), \ + __builtin_bit_cast(__vecmask2, (U)))) -#define _mm_maskz_shldi_epi64(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shldi_epi64((A), (B), (I)), \ - (__v2di)_mm_setzero_si128())) +#define _mm_maskz_shldi_epi64(U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask2, (U)))) #define _mm256_shldi_epi32(A, B, I) \ ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ (__v8si)(__m256i)(B), (int)(I))) -#define _mm256_mask_shldi_epi32(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ - (__v8si)(__m256i)(S))) +#define _mm256_mask_shldi_epi32(S, U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S), \ + 
__builtin_bit_cast(__vecmask8, (U)))) -#define _mm256_maskz_shldi_epi32(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ - (__v8si)_mm256_setzero_si256())) +#define _mm256_maskz_shldi_epi32(U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask8, (U)))) #define _mm_shldi_epi32(A, B, I) \ ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ (__v4si)(__m128i)(B), (int)(I))) -#define _mm_mask_shldi_epi32(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shldi_epi32((A), (B), (I)), \ - (__v4si)(__m128i)(S))) +#define _mm_mask_shldi_epi32(S, U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S), \ + __builtin_bit_cast(__vecmask4, (U)))) -#define _mm_maskz_shldi_epi32(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shldi_epi32((A), (B), (I)), \ - (__v4si)_mm_setzero_si128())) +#define _mm_maskz_shldi_epi32(U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask4, (U)))) #define _mm256_shldi_epi16(A, B, I) \ ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ (__v16hi)(__m256i)(B), (int)(I))) -#define _mm256_mask_shldi_epi16(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ - (__v16hi)(__m256i)(S))) +#define _mm256_mask_shldi_epi16(S, U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S), \ + __builtin_bit_cast(__vecmask16, (U)))) -#define _mm256_maskz_shldi_epi16(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ - (__v16hi)_mm256_setzero_si256())) +#define _mm256_maskz_shldi_epi16(U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask16, (U)))) #define _mm_shldi_epi16(A, B, I) \ ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ (__v8hi)(__m128i)(B), (int)(I))) -#define _mm_mask_shldi_epi16(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ - (__v8hi)(__m128i)(S))) +#define _mm_mask_shldi_epi16(S, U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm_maskz_shldi_epi16(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ - (__v8hi)_mm_setzero_si128())) +#define _mm_maskz_shldi_epi16(U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask8, (U)))) #define _mm256_shrdi_epi64(A, B, I) \ ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ (__v4di)(__m256i)(B), (int)(I))) -#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ - (__v4di)(__m256i)(S))) +#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S), \ + 
__builtin_bit_cast(__vecmask4, (U)))) -#define _mm256_maskz_shrdi_epi64(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ - (__v4di)_mm256_setzero_si256())) +#define _mm256_maskz_shrdi_epi64(U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask4, (U)))) #define _mm_shrdi_epi64(A, B, I) \ ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ (__v2di)(__m128i)(B), (int)(I))) -#define _mm_mask_shrdi_epi64(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ - (__v2di)(__m128i)(S))) +#define _mm_mask_shrdi_epi64(S, U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S), \ + __builtin_bit_cast(__vecmask2, (U)))) -#define _mm_maskz_shrdi_epi64(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ - (__v2di)_mm_setzero_si128())) +#define _mm_maskz_shrdi_epi64(U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask2, (U)))) #define _mm256_shrdi_epi32(A, B, I) \ ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ (__v8si)(__m256i)(B), (int)(I))) -#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ - (__v8si)(__m256i)(S))) +#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm256_maskz_shrdi_epi32(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ - (__v8si)_mm256_setzero_si256())) +#define _mm256_maskz_shrdi_epi32(U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask8, (U)))) #define _mm_shrdi_epi32(A, B, I) \ ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ (__v4si)(__m128i)(B), (int)(I))) -#define _mm_mask_shrdi_epi32(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ - (__v4si)(__m128i)(S))) +#define _mm_mask_shrdi_epi32(S, U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S), \ + __builtin_bit_cast(__vecmask4, (U)))) -#define _mm_maskz_shrdi_epi32(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ - (__v4si)_mm_setzero_si128())) +#define _mm_maskz_shrdi_epi32(U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask4, (U)))) #define _mm256_shrdi_epi16(A, B, I) \ ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ (__v16hi)(__m256i)(B), (int)(I))) -#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ - (__v16hi)(__m256i)(S))) +#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S), \ + __builtin_bit_cast(__vecmask16, 
(U)))) -#define _mm256_maskz_shrdi_epi16(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ - (__v16hi)_mm256_setzero_si256())) +#define _mm256_maskz_shrdi_epi16(U, A, B, I) \ + ((__m256i)__builtin_selectvector((__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256(), \ + __builtin_bit_cast(__vecmask16, (U)))) #define _mm_shrdi_epi16(A, B, I) \ ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ (__v8hi)(__m128i)(B), (int)(I))) -#define _mm_mask_shrdi_epi16(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ - (__v8hi)(__m128i)(S))) +#define _mm_mask_shrdi_epi16(S, U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S), \ + __builtin_bit_cast(__vecmask8, (U)))) -#define _mm_maskz_shrdi_epi16(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ - (__v8hi)_mm_setzero_si128())) +#define _mm_maskz_shrdi_epi16(U, A, B, I) \ + ((__m128i)__builtin_selectvector((__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128(), \ + __builtin_bit_cast(__vecmask8, (U)))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) @@ -422,17 +422,17 @@ _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_shldv_epi64(__A, __B, __C), - (__v4di)__A); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_shldv_epi64(__A, __B, __C), (__v4di)__A, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_shldv_epi64(__A, __B, __C), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_shldv_epi64(__A, __B, __C), (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -445,17 +445,17 @@ _mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_shldv_epi64(__A, __B, __C), - (__v2di)__A); + return (__m128i)__builtin_selectvector((__v2di)_mm_shldv_epi64(__A, __B, __C), + (__v2di)__A, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_shldv_epi64(__A, __B, __C), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_shldv_epi64(__A, __B, __C), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -468,17 +468,17 @@ _mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_shldv_epi32(__A, __B, __C), - (__v8si)__A); + return 
(__m256i)__builtin_selectvector( + (__v8si)_mm256_shldv_epi32(__A, __B, __C), (__v8si)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_shldv_epi32(__A, __B, __C), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_shldv_epi32(__A, __B, __C), (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -491,17 +491,17 @@ _mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_shldv_epi32(__A, __B, __C), - (__v4si)__A); + return (__m128i)__builtin_selectvector((__v4si)_mm_shldv_epi32(__A, __B, __C), + (__v4si)__A, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_shldv_epi32(__A, __B, __C), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_shldv_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -514,17 +514,17 @@ _mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_shldv_epi16(__A, __B, __C), - (__v16hi)__A); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_shldv_epi16(__A, __B, __C), (__v16hi)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_shldv_epi16(__A, __B, __C), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_shldv_epi16(__A, __B, __C), + (__v16hi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -537,17 +537,17 @@ _mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_shldv_epi16(__A, __B, __C), - (__v8hi)__A); + return (__m128i)__builtin_selectvector((__v8hi)_mm_shldv_epi16(__A, __B, __C), + (__v8hi)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_shldv_epi16(__A, __B, __C), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_shldv_epi16(__A, __B, __C), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -560,17 +560,17 @@ _mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { - return 
(__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_shrdv_epi64(__A, __B, __C), - (__v4di)__A); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_shrdv_epi64(__A, __B, __C), (__v4di)__A, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_shrdv_epi64(__A, __B, __C), - (__v4di)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v4di)_mm256_shrdv_epi64(__A, __B, __C), (__v4di)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -583,17 +583,17 @@ _mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_shrdv_epi64(__A, __B, __C), - (__v2di)__A); + return (__m128i)__builtin_selectvector((__v2di)_mm_shrdv_epi64(__A, __B, __C), + (__v2di)__A, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_shrdv_epi64(__A, __B, __C), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v2di)_mm_shrdv_epi64(__A, __B, __C), + (__v2di)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -606,17 +606,17 @@ _mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_shrdv_epi32(__A, __B, __C), - (__v8si)__A); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_shrdv_epi32(__A, __B, __C), (__v8si)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_shrdv_epi32(__A, __B, __C), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_shrdv_epi32(__A, __B, __C), (__v8si)_mm256_setzero_si256(), + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -629,17 +629,17 @@ _mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_shrdv_epi32(__A, __B, __C), - (__v4si)__A); + return (__m128i)__builtin_selectvector((__v4si)_mm_shrdv_epi32(__A, __B, __C), + (__v4si)__A, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_shrdv_epi32(__A, __B, __C), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v4si)_mm_shrdv_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -652,17 +652,17 @@ _mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 _mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), - (__v16hi)__A); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), (__v16hi)__A, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), - (__v16hi)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), + (__v16hi)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -675,17 +675,17 @@ _mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_shrdv_epi16(__A, __B, __C), - (__v8hi)__A); + return (__m128i)__builtin_selectvector((__v8hi)_mm_shrdv_epi16(__A, __B, __C), + (__v8hi)__A, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_shrdv_epi16(__A, __B, __C), - (__v8hi)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector((__v8hi)_mm_shrdv_epi16(__A, __B, __C), + (__v8hi)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask8, __U)); } diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index d1e5cd9d6983f..f6951eb5f0fa7 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -179,129 +179,129 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), - (__v8si)__S); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), (__v8si)__S, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), - (__v8si)__S); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), (__v8si)__S, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask8, __U)); } static 
__inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), - (__v8si)__S); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), (__v8si)__S, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), - (__v8si)__S); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), (__v8si)__S, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), - (__v8si)_mm256_setzero_si256()); + return (__m256i)__builtin_selectvector( + (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256(), __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpbusd_epi32(__S, __A, __B), - (__v4si)__S); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_dpbusd_epi32(__S, __A, __B), (__v4si)__S, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpbusd_epi32(__S, __A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_dpbusd_epi32(__S, __A, __B), (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpbusds_epi32(__S, __A, __B), - (__v4si)__S); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_dpbusds_epi32(__S, __A, __B), (__v4si)__S, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpbusds_epi32(__S, __A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_dpbusds_epi32(__S, __A, __B), (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpwssd_epi32(__S, __A, __B), - (__v4si)__S); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_dpwssd_epi32(__S, __A, __B), (__v4si)__S, + __builtin_bit_cast(__vecmask4, __U)); } 
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpwssd_epi32(__S, __A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_dpwssd_epi32(__S, __A, __B), (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpwssds_epi32(__S, __A, __B), - (__v4si)__S); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_dpwssds_epi32(__S, __A, __B), (__v4si)__S, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpwssds_epi32(__S, __A, __B), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_selectvector( + (__v4si)_mm_dpwssds_epi32(__S, __A, __B), (__v4si)_mm_setzero_si128(), + __builtin_bit_cast(__vecmask4, __U)); } #undef __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h index 0fb381a12f2fd..8024c0be721ef 100644 --- a/clang/lib/Headers/avx512vnniintrin.h +++ b/clang/lib/Headers/avx512vnniintrin.h @@ -29,17 +29,17 @@ _mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), - (__v16si)__S); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), (__v16si)__S, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -52,17 +52,17 @@ _mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), - (__v16si)__S); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), (__v16si)__S, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -75,17 +75,17 @@ _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { - return 
(__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), - (__v16si)__S); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), (__v16si)__S, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -98,17 +98,17 @@ _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), - (__v16si)__S); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), (__v16si)__S, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), - (__v16si)_mm512_setzero_si512()); + return (__m512i)__builtin_selectvector( + (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512(), __builtin_bit_cast(__vecmask16, __U)); } #undef __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512vpopcntdqintrin.h b/clang/lib/Headers/avx512vpopcntdqintrin.h index e73e7e4f71313..f9bc395bbfd3b 100644 --- a/clang/lib/Headers/avx512vpopcntdqintrin.h +++ b/clang/lib/Headers/avx512vpopcntdqintrin.h @@ -27,8 +27,9 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) { static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectq_512( - (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W); + return (__m512i)__builtin_selectvector((__v8di)_mm512_popcnt_epi64(__A), + (__v8di)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -42,8 +43,9 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) { static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectd_512( - (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W); + return (__m512i)__builtin_selectvector((__v16si)_mm512_popcnt_epi32(__A), + (__v16si)__W, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512vpopcntdqvlintrin.h b/clang/lib/Headers/avx512vpopcntdqvlintrin.h index b2df2e84d3ed2..c8e74c7218338 100644 --- a/clang/lib/Headers/avx512vpopcntdqvlintrin.h +++ b/clang/lib/Headers/avx512vpopcntdqvlintrin.h @@ -32,8 +32,9 @@ _mm_popcnt_epi64(__m128i __A) { static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128( - (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W); + return (__m128i)__builtin_selectvector((__v2di)_mm_popcnt_epi64(__A), + (__v2di)__W, + __builtin_bit_cast(__vecmask2, __U)); } static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 @@ -48,8 +49,9 @@ _mm_popcnt_epi32(__m128i __A) { static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128( - (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W); + return (__m128i)__builtin_selectvector((__v4si)_mm_popcnt_epi32(__A), + (__v4si)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -64,8 +66,9 @@ _mm256_popcnt_epi64(__m256i __A) { static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectq_256( - (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W); + return (__m256i)__builtin_selectvector((__v4di)_mm256_popcnt_epi64(__A), + (__v4di)__W, + __builtin_bit_cast(__vecmask4, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -80,8 +83,9 @@ _mm256_popcnt_epi32(__m256i __A) { static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256( - (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W); + return (__m256i)__builtin_selectvector((__v8si)_mm256_popcnt_epi32(__A), + (__v8si)__W, + __builtin_bit_cast(__vecmask8, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/gfniintrin.h b/clang/lib/Headers/gfniintrin.h index 73b04a824aba8..128818fffc455 100644 --- a/clang/lib/Headers/gfniintrin.h +++ b/clang/lib/Headers/gfniintrin.h @@ -88,10 +88,10 @@ _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) (__v64qi)(__m512i)(B), \ (char)(I))) -#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v64qi)(__m512i)(S))) +#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + ((__m512i)__builtin_selectvector( \ + (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v64qi)(__m512i)(S), __builtin_bit_cast(__vecmask64, (U)))) #define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ @@ -102,10 +102,10 @@ _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) (__v64qi)(__m512i)(B), \ (char)(I))) -#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \ - (__v64qi)(__m512i)(S))) +#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + ((__m512i)__builtin_selectvector( \ + (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \ + (__v64qi)(__m512i)(S), __builtin_bit_cast(__vecmask64, (U)))) #define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ @@ -121,9 +121,9 @@ _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_selectb_512(__U, - (__v64qi) _mm512_gf2p8mul_epi8(__A, __B), - (__v64qi) __S); + return (__m512i)__builtin_selectvector( + (__v64qi)_mm512_gf2p8mul_epi8(__A, __B), (__v64qi)__S, + __builtin_bit_cast(__vecmask64, __U)); } static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK @@ -135,36 +135,36 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) #endif /* __AVX512BWINTRIN_H */ #ifdef 
__AVX512VLBWINTRIN_H -#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S))) +#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + ((__m128i)__builtin_selectvector( \ + (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), (__v16qi)(__m128i)(S), \ + __builtin_bit_cast(__vecmask16, (U)))) #define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ U, A, B, I) -#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S))) +#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + ((__m256i)__builtin_selectvector( \ + (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S), __builtin_bit_cast(__vecmask32, (U)))) #define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ U, A, B, I) -#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S))) +#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + ((__m128i)__builtin_selectvector( \ + (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), (__v16qi)(__m128i)(S), \ + __builtin_bit_cast(__vecmask16, (U)))) #define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I) -#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S))) +#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + ((__m256i)__builtin_selectvector( \ + (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), (__v32qi)(__m256i)(S), \ + __builtin_bit_cast(__vecmask32, (U)))) #define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ @@ -173,9 +173,9 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 _mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_selectb_128(__U, - (__v16qi) _mm_gf2p8mul_epi8(__A, __B), - (__v16qi) __S); + return (__m128i)__builtin_selectvector((__v16qi)_mm_gf2p8mul_epi8(__A, __B), + (__v16qi)__S, + __builtin_bit_cast(__vecmask16, __U)); } static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 @@ -188,9 +188,9 @@ _mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 _mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_selectb_256(__U, - (__v32qi) _mm256_gf2p8mul_epi8(__A, __B), - (__v32qi) __S); + return (__m256i)__builtin_selectvector( + (__v32qi)_mm256_gf2p8mul_epi8(__A, __B), (__v32qi)__S, + __builtin_bit_cast(__vecmask32, __U)); } static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 @@ -208,4 +208,3 @@ _mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B) #undef __DEFAULT_FN_ATTRS_VL256 #endif /* __GFNIINTRIN_H */ - diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 8e21811b67d90..1074defd0c986 100644 --- 
a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3013,6 +3013,62 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     break;
   }
 
+  case Builtin::BI__builtin_selectvector: {
+    if (checkArgCount(*this, TheCall, 3))
+      return ExprError();
+
+    ExprResult LHS = TheCall->getArg(0);
+    ExprResult RHS = TheCall->getArg(1);
+
+    QualType Result = UsualArithmeticConversions(
+        LHS, RHS, TheCall->getExprLoc(), ACK_Comparison);
+
+    ExprResult Mask = UsualUnaryConversions(TheCall->getArg(2));
+
+    if (LHS.isInvalid() || RHS.isInvalid() || Mask.isInvalid())
+      return ExprError();
+
+    QualType LHST = LHS.get()->getType();
+    QualType RHST = RHS.get()->getType();
+    QualType MaskT = Mask.get()->getType();
+
+    if (Result.isNull() || LHST.getCanonicalType() != RHST.getCanonicalType()) {
+      Diag(LHS.get()->getBeginLoc(),
+           diag::err_typecheck_call_different_arg_types)
+          << LHST << RHST;
+      return ExprError();
+    }
+
+    const auto *LHSVecT = LHST->getAs<VectorType>();
+    const auto *MaskVecT = MaskT->getAs<VectorType>();
+
+    if (!LHSVecT) {
+      Diag(LHS.get()->getBeginLoc(), diag::err_builtin_invalid_arg_type)
+          << 1 << 4 << LHST;
+      return ExprError();
+    }
+
+    if (!MaskVecT || !MaskVecT->isExtVectorBoolType()) {
+      Diag(Mask.get()->getBeginLoc(), diag::err_builtin_invalid_arg_type)
+          << 3 << 9 << MaskT;
+      return ExprError();
+    }
+
+    if (LHSVecT->getNumElements() != MaskVecT->getNumElements()) {
+      Diag(LHS.get()->getBeginLoc(),
+           diag::err_typecheck_vector_lengths_not_equal)
+          << LHST << MaskT << LHS.get()->getSourceRange()
+          << Mask.get()->getSourceRange();
+      return ExprError();
+    }
+
+    TheCall->setType(Result);
+    TheCall->setArg(0, LHS.get());
+    TheCall->setArg(1, RHS.get());
+    TheCall->setArg(2, Mask.get());
+    break;
+  }
+
   // __builtin_elementwise_abs restricts the element type to signed integers or
   // floating point types only.
case Builtin::BI__builtin_elementwise_abs: { diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index b61c3eb3d54ad..38443cf620b5b 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -262,16 +262,16 @@ __m512d test_mm512_xor_pd (__m512d __A, __m512d __B) { __m512d test_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_xor_pd - // CHECK: xor <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: xor <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_mask_xor_pd(__W, __U, __A, __B); } __m512d test_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_xor_pd - // CHECK: xor <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: xor <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_maskz_xor_pd(__U, __A, __B); } @@ -284,16 +284,16 @@ __m512 test_mm512_xor_ps (__m512 __A, __m512 __B) { __m512 test_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_xor_ps - // CHECK: xor <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: xor <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_mask_xor_ps(__W, __U, __A, __B); } __m512 test_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_xor_ps - // CHECK: xor <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: xor <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_maskz_xor_ps(__U, __A, __B); } @@ -306,16 +306,16 @@ __m512d test_mm512_or_pd (__m512d __A, __m512d __B) { __m512d test_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_or_pd - // CHECK: or <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: or <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_mask_or_pd(__W, __U, __A, __B); } __m512d test_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_or_pd - // CHECK: or <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: or <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_maskz_or_pd(__U, __A, __B); } @@ -328,16 +328,16 @@ __m512 test_mm512_or_ps (__m512 __A, __m512 __B) { __m512 test_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_or_ps - // CHECK: or <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: or <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_mask_or_ps(__W, __U, __A, __B); } __m512 test_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_or_ps - // CHECK: or <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: or <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_maskz_or_ps(__U, __A, __B); } @@ -350,16 +350,16 @@ __m512d test_mm512_and_pd 
(__m512d __A, __m512d __B) { __m512d test_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_mask_and_pd - // CHECK: and <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: and <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_mask_and_pd(__W, __U, __A, __B); } __m512d test_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: @test_mm512_maskz_and_pd - // CHECK: and <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: and <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}} return (__m512d) _mm512_maskz_and_pd(__U, __A, __B); } @@ -372,16 +372,16 @@ __m512 test_mm512_and_ps (__m512 __A, __m512 __B) { __m512 test_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_mask_and_ps - // CHECK: and <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: and <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_mask_and_ps(__W, __U, __A, __B); } __m512 test_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: @test_mm512_maskz_and_ps - // CHECK: and <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: and <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}} return (__m512) _mm512_maskz_and_ps(__U, __A, __B); } @@ -1520,4 +1520,3 @@ __mmask8 test_mm_mask_fpclass_ss_mask(__mmask8 __U, __m128 __A) { // CHECK: @llvm.x86.avx512.mask.fpclass.ss return _mm_mask_fpclass_ss_mask (__U, __A, 2); } - diff --git a/clang/test/CodeGen/X86/avx512f-builtins-constrained.c b/clang/test/CodeGen/X86/avx512f-builtins-constrained.c index 4044021a3f9e0..8c553af6c7ea6 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins-constrained.c +++ b/clang/test/CodeGen/X86/avx512f-builtins-constrained.c @@ -27,10 +27,10 @@ __m512d test_mm512_sqrt_pd(__m512d a) __m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) { // COMMON-LABEL: test_mm512_mask_sqrt_pd + // COMMONIR: bitcast i8 %{{.*}} to <8 x i1> // UNCONSTRAINED: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}}) // CONSTRAINED: call <8 x double> @llvm.experimental.constrained.sqrt.v8f64(<8 x double> %{{.*}}, metadata !{{.*}}) // CHECK-ASM: vsqrtpd - // COMMONIR: bitcast i8 %{{.*}} to <8 x i1> // COMMONIR: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_sqrt_pd (__W,__U,__A); } @@ -38,10 +38,10 @@ __m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) __m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) { // COMMON-LABEL: test_mm512_maskz_sqrt_pd + // COMMONIR: bitcast i8 %{{.*}} to <8 x i1> // UNCONSTRAINED: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}}) // CONSTRAINED: call <8 x double> @llvm.experimental.constrained.sqrt.v8f64(<8 x double> %{{.*}}, metadata !{{.*}}) // CHECK-ASM: vsqrtpd - // COMMONIR: bitcast i8 %{{.*}} to <8 x i1> // COMMONIR: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}} return _mm512_maskz_sqrt_pd (__U,__A); } @@ -58,10 +58,10 @@ __m512 test_mm512_sqrt_ps(__m512 a) __m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) { // COMMON-LABEL: test_mm512_mask_sqrt_ps + // COMMONIR: bitcast i16 %{{.*}} to <16 x i1> // UNCONSTRAINED: call <16 x float> @llvm.sqrt.v16f32(<16 x float> 
%{{.*}}) // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.sqrt.v16f32(<16 x float> %{{.*}}, metadata !{{.*}}) // CHECK-ASM: vsqrtps - // COMMONIR: bitcast i16 %{{.*}} to <16 x i1> // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_sqrt_ps( __W, __U, __A); } @@ -69,10 +69,10 @@ __m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) __m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) { // COMMON-LABEL: test_mm512_maskz_sqrt_ps + // COMMONIR: bitcast i16 %{{.*}} to <16 x i1> // UNCONSTRAINED: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}}) // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.sqrt.v16f32(<16 x float> %{{.*}}, metadata !{{.*}}) // CHECK-ASM: vsqrtps - // COMMONIR: bitcast i16 %{{.*}} to <16 x i1> // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}} return _mm512_maskz_sqrt_ps(__U ,__A); } @@ -206,4 +206,3 @@ __m512 test_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_cvtph_ps (__U,__A); } - diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 0e3463849951e..1feb0fbea2022 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -13,8 +13,8 @@ __m512d test_mm512_sqrt_pd(__m512d a) __m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_mask_sqrt_pd - // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}}) // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_sqrt_pd (__W,__U,__A); } @@ -22,8 +22,8 @@ __m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) __m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) { // CHECK-LABEL: @test_mm512_maskz_sqrt_pd - // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}}) // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}} return _mm512_maskz_sqrt_pd (__U,__A); } @@ -31,8 +31,8 @@ __m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) __m512d test_mm512_mask_sqrt_round_pd(__m512d __W,__mmask8 __U,__m512d __A) { // CHECK-LABEL: @test_mm512_mask_sqrt_round_pd - // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 11) // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 11) // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_sqrt_round_pd(__W,__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } @@ -40,8 +40,8 @@ __m512d test_mm512_mask_sqrt_round_pd(__m512d __W,__mmask8 __U,__m512d __A) __m512d test_mm512_maskz_sqrt_round_pd(__mmask8 __U,__m512d __A) { // CHECK-LABEL: @test_mm512_maskz_sqrt_round_pd - // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 11) // CHECK: bitcast i8 %{{.*}} to <8 x i1> + // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 11) // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}} return _mm512_maskz_sqrt_round_pd(__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } @@ -63,8 
+63,8 @@ __m512 test_mm512_sqrt_ps(__m512 a) __m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) { // CHECK-LABEL: @test_mm512_mask_sqrt_ps - // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}}) // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_sqrt_ps( __W, __U, __A); } @@ -72,8 +72,8 @@ __m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) __m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) { // CHECK-LABEL: @test_mm512_maskz_sqrt_ps - // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}}) // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}} return _mm512_maskz_sqrt_ps(__U ,__A); } @@ -81,8 +81,8 @@ __m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) __m512 test_mm512_mask_sqrt_round_ps(__m512 __W,__mmask16 __U,__m512 __A) { // CHECK-LABEL: @test_mm512_mask_sqrt_round_ps - // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 11) // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 11) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_sqrt_round_ps(__W,__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } @@ -90,8 +90,8 @@ __m512 test_mm512_mask_sqrt_round_ps(__m512 __W,__mmask16 __U,__m512 __A) __m512 test_mm512_maskz_sqrt_round_ps(__mmask16 __U,__m512 __A) { // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ps - // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 11) // CHECK: bitcast i16 %{{.*}} to <16 x i1> + // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 11) // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}} return _mm512_maskz_sqrt_round_ps(__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } @@ -2709,96 +2709,96 @@ __mmask8 test_mm512_mask_cmp_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) __m512i test_mm512_mask_and_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_mask_and_epi32 - // CHECK: and <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: and <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_and_epi32(__src, __k,__a, __b); } __m512i test_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_maskz_and_epi32 - // CHECK: and <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: and <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_and_epi32(__k,__a, __b); } __m512i test_mm512_mask_and_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_mask_and_epi64 - // CHECK: %[[AND_RES:.*]] = and <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %[[AND_RES:.*]] = and <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}} return _mm512_mask_and_epi64(__src, __k,__a, __b); } __m512i test_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_maskz_and_epi64 - // CHECK: %[[AND_RES:.*]] = and <8 x i64> // 
CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %[[AND_RES:.*]] = and <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}} return _mm512_maskz_and_epi64(__k,__a, __b); } __m512i test_mm512_mask_or_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_mask_or_epi32 - // CHECK: or <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: or <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_or_epi32(__src, __k,__a, __b); } __m512i test_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_maskz_or_epi32 - // CHECK: or <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: or <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_or_epi32(__k,__a, __b); } __m512i test_mm512_mask_or_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_mask_or_epi64 - // CHECK: %[[OR_RES:.*]] = or <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %[[OR_RES:.*]] = or <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}} return _mm512_mask_or_epi64(__src, __k,__a, __b); } __m512i test_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_maskz_or_epi64 - // CHECK: %[[OR_RES:.*]] = or <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %[[OR_RES:.*]] = or <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}} return _mm512_maskz_or_epi64(__k,__a, __b); } __m512i test_mm512_mask_xor_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_mask_xor_epi32 - // CHECK: xor <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: xor <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_xor_epi32(__src, __k,__a, __b); } __m512i test_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_maskz_xor_epi32 - // CHECK: xor <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: xor <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_xor_epi32(__k,__a, __b); } __m512i test_mm512_mask_xor_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_mask_xor_epi64 - // CHECK: %[[XOR_RES:.*]] = xor <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %[[XOR_RES:.*]] = xor <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}} return _mm512_mask_xor_epi64(__src, __k,__a, __b); } __m512i test_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) { // CHECK-LABEL: @test_mm512_maskz_xor_epi64 - // CHECK: %[[XOR_RES:.*]] = xor <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %[[XOR_RES:.*]] = xor <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}} return _mm512_maskz_xor_epi64(__k,__a, __b); } @@ -10769,8 +10769,8 @@ __m512d test_mm512_abs_pd(__m512d a){ __m512d test_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A){ // CHECK-LABEL: @test_mm512_mask_abs_pd - // CHECK: %[[AND_RES:.*]] = and <8 x i64> // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: 
%[[AND_RES:.*]] = and <8 x i64> // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}} return _mm512_mask_abs_pd (__W,__U,__A); } @@ -10783,8 +10783,8 @@ __m512 test_mm512_abs_ps(__m512 a){ __m512 test_mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A){ // CHECK-LABEL: @test_mm512_mask_abs_ps - // CHECK: and <16 x i32> // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: and <16 x i32> // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_abs_ps( __W, __U, __A); } diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c index a766476ca92bd..27cd2c18dc373 100644 --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -701,6 +701,7 @@ __m512h test_mm512_conj_pch(__m512h __A) { __m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) { // CHECK-LABEL: @test_mm512_mask_conj_pch // CHECK: %{{.*}} = trunc i32 %{{.*}} to i16 + // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float> // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> @@ -708,7 +709,6 @@ __m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) { // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float> // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half> // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float> - // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half> return _mm512_mask_conj_pch(__W, __U, __A); @@ -717,13 +717,13 @@ __m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) { __m512h test_mm512_maskz_conj_pch(__mmask32 __U, __m512h __A) { // CHECK-LABEL: @test_mm512_maskz_conj_pch // CHECK: %{{.*}} = trunc i32 %{{.*}} to i16 + // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float> // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32> // CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}} // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float> // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half> - // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half> return _mm512_maskz_conj_pch(__U, __A); @@ -2052,16 +2052,16 @@ __m512h test_mm512_sqrt_round_ph(__m512h __A) { __m512h test_mm512_mask_sqrt_round_ph(__m512h __W, __mmask32 __U, __m512h __A) { // CHECK-LABEL: @test_mm512_mask_sqrt_round_ph - // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11) // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11) // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} return _mm512_mask_sqrt_round_ph(__W, __U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } __m512h test_mm512_maskz_sqrt_round_ph(__mmask32 __U, __m512h __A) { // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ph - // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11) // CHECK: 
bitcast i32 %{{.*}} to <32 x i1> + // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11) // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}} return _mm512_maskz_sqrt_round_ph(__U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } @@ -2073,15 +2073,15 @@ __m512h test_mm512_sqrt_ph(__m512h __A) { } __m512h test_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { // CHECK-LABEL: @test_mm512_mask_sqrt_ph - // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}}) // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} return _mm512_mask_sqrt_ph(__W, __U, __A); } __m512h test_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { // CHECK-LABEL: @test_mm512_maskz_sqrt_ph - // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}}) // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}} return _mm512_maskz_sqrt_ph(__U, __A); } diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 6f544c21e798d..484001df48b80 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -2836,210 +2836,210 @@ __mmask8 test_mm_mask_cmp_pd_mask_true_us(__mmask8 m, __m128d a, __m128d b) { __m128d test_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_mask_fmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_fmadd_pd(__A, __U, __B, __C); } __m128d test_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_mask_fmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_fmsub_pd(__A, __U, __B, __C); } __m128d test_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask3_fmadd_pd(__A, __B, __C, __U); } __m128d test_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fnmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> 
%{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask3_fnmadd_pd(__A, __B, __C, __U); } __m128d test_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fmadd_pd(__U, __A, __B, __C); } __m128d test_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fmsub_pd(__U, __A, __B, __C); } __m128d test_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fnmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fnmadd_pd(__U, __A, __B, __C); } __m128d test_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fnmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: fneg <2 x double> %{{.*}} // CHECK: fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fnmsub_pd(__U, __A, __B, __C); } __m256d test_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_mask_fmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_fmadd_pd(__A, __U, __B, __C); } __m256d test_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_mask_fmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_fmsub_pd(__A, __U, __B, __C); } __m256d test_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm256_mask3_fmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: call 
<4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask3_fmadd_pd(__A, __B, __C, __U); } __m256d test_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm256_mask3_fnmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask3_fnmadd_pd(__A, __B, __C, __U); } __m256d test_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_maskz_fmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_fmadd_pd(__U, __A, __B, __C); } __m256d test_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_maskz_fmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_fmsub_pd(__U, __A, __B, __C); } __m256d test_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_maskz_fnmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_fnmadd_pd(__U, __A, __B, __C); } __m256d test_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_maskz_fnmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x double> %{{.*}} // CHECK: fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_fnmsub_pd(__U, __A, __B, __C); } __m128 test_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_mask_fmadd_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_fmadd_ps(__A, __U, __B, __C); } 
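(Editor's note: the 128- and 256-bit intrinsics consume only the low lanes of the 8-bit mask, which is why these updated checks now expect the <8 x i1> mask to be narrowed by a shufflevector, whose unused second operand prints as poison, before the select. A hypothetical sketch of that narrowing follows; the type and function names are illustrative, not taken from the patch.)

// Hypothetical sketch: extracting the low four lanes of an 8-lane bool mask.
typedef _Bool __vecmask8 __attribute__((ext_vector_type(8)));
typedef _Bool __vecmask4 __attribute__((ext_vector_type(4)));

static inline __vecmask4 low_four_lanes(unsigned char mask) {
  // Eight packed booleans occupy the same 8 bits as the unsigned char mask.
  __vecmask8 m = __builtin_bit_cast(__vecmask8, mask);
  // Only lanes 0..3 are referenced, so LLVM renders the unused second
  // shuffle operand as "<8 x i1> poison", matching the CHECK lines above.
  return __builtin_shufflevector(m, m, 0, 1, 2, 3);
}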
__m128 test_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_mask_fmsub_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_fmsub_ps(__A, __U, __B, __C); } __m128 test_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fmadd_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask3_fmadd_ps(__A, __B, __C, __U); } __m128 test_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fnmadd_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask3_fnmadd_ps(__A, __B, __C, __U); } __m128 test_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_maskz_fmadd_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_fmadd_ps(__U, __A, __B, __C); } __m128 test_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_maskz_fmsub_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_fmsub_ps(__U, __A, __B, __C); } __m128 test_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_maskz_fnmadd_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_fnmadd_ps(__U, __A, __B, __C); } __m128 test_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_maskz_fnmsub_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x float> %{{.*}} // CHECK: fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 
x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_fnmsub_ps(__U, __A, __B, __C); } @@ -3108,90 +3108,90 @@ __m256 test_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 _ __m128d test_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_mask_fmaddsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK-NOT: fneg // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_fmaddsub_pd(__A, __U, __B, __C); } __m128d test_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_mask_fmsubadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_fmsubadd_pd(__A, __U, __B, __C); } __m128d test_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fmaddsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK-NOT: fneg // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask3_fmaddsub_pd(__A, __B, __C, __U); } __m128d test_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fmaddsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK-NOT: fneg // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fmaddsub_pd(__U, __A, __B, __C); } __m128d test_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_maskz_fmsubadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_fmsubadd_pd(__U, __A, __B, __C); } __m256d test_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_mask_fmaddsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK-NOT: fneg // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return 
_mm256_mask_fmaddsub_pd(__A, __U, __B, __C); } __m256d test_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_mask_fmsubadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: [[NEG:%.+]] = fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_fmsubadd_pd(__A, __U, __B, __C); } __m256d test_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm256_mask3_fmaddsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK-NOT: fneg // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask3_fmaddsub_pd(__A, __B, __C, __U); } __m256d test_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_maskz_fmaddsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK-NOT: fneg // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_fmaddsub_pd(__U, __A, __B, __C); } __m256d test_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_maskz_fmsubadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: [[NEG:%.+]] = fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_fmsubadd_pd(__U, __A, __B, __C); } @@ -3199,44 +3199,44 @@ __m256d test_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m __m128 test_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_mask_fmaddsub_ps // CHECK-NOT: fneg + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_fmaddsub_ps(__A, __U, __B, __C); } __m128 test_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_mask_fmsubadd_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_fmsubadd_ps(__A, __U, __B, __C); } __m128 
test_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fmaddsub_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK-NOT: fneg // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask3_fmaddsub_ps(__A, __B, __C, __U); } __m128 test_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_maskz_fmaddsub_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK-NOT: fneg // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_fmaddsub_ps(__U, __A, __B, __C); } __m128 test_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_maskz_fmsubadd_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_fmsubadd_ps(__U, __A, __B, __C); } @@ -3283,27 +3283,27 @@ __m256 test_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __m128d test_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask3_fmsub_pd(__A, __B, __C, __U); } __m256d test_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm256_mask3_fmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask3_fmsub_pd(__A, __B, __C, __U); } __m128 test_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fmsub_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask3_fmsub_ps(__A, __B, __C, __U); } @@ -3318,27 +3318,27 @@ __m256 test_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __ __m128d test_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { // 
CHECK-LABEL: @test_mm_mask3_fmsubadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}} // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask3_fmsubadd_pd(__A, __B, __C, __U); } __m256d test_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm256_mask3_fmsubadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: [[NEG:%.+]] = fneg <4 x double> %{{.+}} // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]]) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask3_fmsubadd_pd(__A, __B, __C, __U); } __m128 test_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fmsubadd_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}} // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask3_fmsubadd_ps(__A, __B, __C, __U); } @@ -3353,27 +3353,27 @@ __m256 test_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __m128d test_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { // CHECK-LABEL: @test_mm_mask_fnmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_fnmadd_pd(__A, __U, __B, __C); } __m256d test_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_mask_fnmadd_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_fnmadd_pd(__A, __U, __B, __C); } __m128 test_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_mask_fnmadd_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_fnmadd_ps(__A, __U, __B, __C); } @@ -3388,60 +3388,60 @@ __m256 test_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __ __m128d test_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d 
__C) { // CHECK-LABEL: @test_mm_mask_fnmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: fneg <2 x double> %{{.*}} // CHECK: fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_fnmsub_pd(__A, __U, __B, __C); } __m128d test_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fnmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32> // CHECK: fneg <2 x double> %{{.*}} // CHECK: fneg <2 x double> %{{.*}} // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32> // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask3_fnmsub_pd(__A, __B, __C, __U); } __m256d test_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { // CHECK-LABEL: @test_mm256_mask_fnmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x double> %{{.*}} // CHECK: fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_fnmsub_pd(__A, __U, __B, __C); } __m256d test_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { // CHECK-LABEL: @test_mm256_mask3_fnmsub_pd + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x double> %{{.*}} // CHECK: fneg <4 x double> %{{.*}} // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask3_fnmsub_pd(__A, __B, __C, __U); } __m128 test_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { // CHECK-LABEL: @test_mm_mask_fnmsub_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x float> %{{.*}} // CHECK: fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_fnmsub_ps(__A, __U, __B, __C); } __m128 test_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { // CHECK-LABEL: @test_mm_mask3_fnmsub_ps + // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: fneg <4 x float> %{{.*}} // CHECK: fneg <4 x float> %{{.*}} // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) - // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask3_fnmsub_ps(__A, __B, __C, __U); } @@ -6668,22 +6668,22 @@ __m256d test_mm256_maskz_movedup_pd(__mmask8 __U, __m256d __A) { __m128i test_mm_mask_set1_epi32(__m128i __O, __mmask8 __M) { // 
CHECK-LABEL: @test_mm_mask_set1_epi32
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
   // CHECK: insertelement <4 x i32> %{{.*}}32 1
   // CHECK: insertelement <4 x i32> %{{.*}}32 2
   // CHECK: insertelement <4 x i32> %{{.*}}32 3
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_set1_epi32(__O, __M, 5);
 }
 
 __m128i test_mm_maskz_set1_epi32(__mmask8 __M) {
   // CHECK-LABEL: @test_mm_maskz_set1_epi32
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
   // CHECK: insertelement <4 x i32> %{{.*}}32 1
   // CHECK: insertelement <4 x i32> %{{.*}}32 2
   // CHECK: insertelement <4 x i32> %{{.*}}32 3
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_set1_epi32(__M, 5);
 }
 
@@ -6718,40 +6718,40 @@ __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) {
 __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm_mask_set1_epi64
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
   // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_set1_epi64(__O, __M, __A);
 }
 
 __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm_maskz_set1_epi64
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
   // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_set1_epi64(__M, __A);
 }
 
 __m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm256_mask_set1_epi64
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_set1_epi64(__O, __M, __A);
 }
 
 __m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm256_maskz_set1_epi64
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_set1_epi64(__M, __A);
 }
 
@@ -7108,14 +7108,14 @@ __m128d test_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) {
 __m256d test_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_mask_unpackhi_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}} <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_unpackhi_pd(__W, __U, __A, __B);
 }
 
 __m256d test_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpackhi_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}} <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_unpackhi_pd(__U, __A, __B);
 }
 
@@ -7931,16 +7931,16 @@ __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
 __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3);
 }
 
 __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3);
 }
 
@@ -7973,16 +7973,16 @@ __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
 __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32>
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3);
 }
 
 __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32>
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_shuffle_i64x2(__U, __A, __B, 3);
 }
diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
index 3a212ed683437..7d41015a3a729 100644
--- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
@@ -419,6 +419,7 @@ __m256h test_mm256_conj_pch(__m256h __A) {
 __m256h test_mm256_mask_conj_pch(__m256h __W, __mmask32 __U, __m256h __A) {
   // CHECK-LABEL: @test_mm256_mask_conj_pch
   // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8
+  // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float>
   // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
   // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
@@ -426,7 +427,6 @@ __m256h test_mm256_mask_conj_pch(__m256h __W, __mmask32 __U, __m256h __A) {
   // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float>
   // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half>
   // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float>
-  // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half>
   return _mm256_mask_conj_pch(__W, __U, __A);
@@ -435,13 +435,13 @@ __m256h test_mm256_mask_conj_pch(__m256h __W, __mmask32 __U, __m256h __A) {
 __m256h test_mm256_maskz_conj_pch(__mmask32 __U, __m256h __A) {
   // CHECK-LABEL: @test_mm256_maskz_conj_pch
   // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8
+  // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float>
   // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
   // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32>
   // CHECK: %{{.*}} = xor <8 x i32> %{{.*}}, %{{.*}}
   // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float>
   // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half>
-  // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half>
   return _mm256_maskz_conj_pch(__U, __A);
@@ -461,6 +461,8 @@ __m128h test_mm_conj_pch(__m128h __A) {
 __m128h test_mm_mask_conj_pch(__m128h __W, __mmask32 __U, __m128h __A) {
   // CHECK-LABEL: @test_mm_mask_conj_pch
   // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8
+  // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
@@ -468,8 +470,6 @@ __m128h test_mm_mask_conj_pch(__m128h __W, __mmask32 __U, __m128h __A) {
   // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
   // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
   return _mm_mask_conj_pch(__W, __U, __A);
@@ -478,14 +478,14 @@ __m128h test_mm_mask_conj_pch(__m128h __W, __mmask32 __U, __m128h __A) {
 __m128h test_mm_maskz_conj_pch(__mmask32 __U, __m128h __A) {
   // CHECK-LABEL: @test_mm_maskz_conj_pch
   // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8
+  // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32>
   // CHECK: %{{.*}} = xor <4 x i32> %{{.*}}, %{{.*}}
   // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float>
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
-  // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
   return _mm_maskz_conj_pch(__U, __A);
@@ -2844,8 +2844,8 @@ __m128h test_mm_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C) {
 __m128h test_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
   // CHECK-LABEL: @test_mm_mask_fcmadd_pch
+  // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.128
-  // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fcmadd_pch(__A, __U, __B, __C);
 }
@@ -2933,8 +2933,8 @@ __m128h test_mm_fmadd_pch(__m128h __A, __m128h __B, __m128h __C) {
 __m128h test_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
   // CHECK-LABEL: @test_mm_mask_fmadd_pch
+  // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.128
-  // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmadd_pch(__A, __U, __B, __C);
 }
diff --git a/clang/test/Sema/builtin-selectvector.c b/clang/test/Sema/builtin-selectvector.c
new file mode 100644
index 0000000000000..4ba44bbc6c60a
--- /dev/null
+++ b/clang/test/Sema/builtin-selectvector.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -std=c99 %s -pedantic -verify -triple=x86_64-apple-darwin9
+
+typedef double double2 __attribute__((ext_vector_type(2)));
+typedef double double4 __attribute__((ext_vector_type(4)));
+
+typedef _Bool bool2 __attribute__((ext_vector_type(2)));
+typedef _Bool bool4 __attribute__((ext_vector_type(4)));
+
+void test(bool2 vec_bool2, bool4 vec_bool4, double2 vec_double2, double4 vec_double4) {
+  __builtin_selectvector(); // expected-error {{too few arguments to function call, expected 3, have 0}}
+  (void)__builtin_selectvector(0, 0, 0); // expected-error {{1st argument must be a vector type (was 'int')}}
+  (void)__builtin_selectvector(vec_double2, 0, 0); // expected-error {{arguments are of different types ('double2' (vector of 2 'double' values) vs 'int')}}
+  (void)__builtin_selectvector(vec_double2, vec_double2, 0); // expected-error {{3rd argument must be a vector of bools (was 'int')}}
+  (void)__builtin_selectvector(vec_double2, vec_double2, vec_double2); // expected-error {{3rd argument must be a vector of bools (was 'double2' (vector of 2 'double' values))}}
+  (void)__builtin_selectvector(vec_double2, vec_double4, vec_bool2); // expected-error {{arguments are of different types ('double2' (vector of 2 'double' values) vs 'double4' (vector of 4 'double' values))}}
+  (void)__builtin_selectvector(vec_double2, vec_double2, vec_bool4); // expected-error {{vector operands do not have the same number of elements ('double2' (vector of 2 'double' values) and 'bool4' (vector of 4 '_Bool' values))}}
+  (void)__builtin_selectvector(vec_double2, vec_double2, vec_bool2);
+}
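For readers trying the builtin outside the test suite, here is a minimal usage sketch. It is not part of the patch; the double2/bool2 typedefs mirror the ones in the Sema test above, and blend2 is a hypothetical helper name. It assumes a clang build with this patch applied.

// Minimal sketch (not part of the patch): per-lane blend of two double2
// vectors via the new builtin, assuming a clang build that includes it.
typedef double double2 __attribute__((ext_vector_type(2)));
typedef _Bool bool2 __attribute__((ext_vector_type(2)));

// blend2 is a hypothetical helper; each result lane is taken from lhs or
// rhs according to the corresponding lane of cond (see the
// LanguageExtensions entry added by this patch for the exact semantics).
static double2 blend2(double2 lhs, double2 rhs, bool2 cond) {
  return __builtin_selectvector(lhs, rhs, cond);
}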
From b310cbf8f4b34c53c6f9a688ce9cc992001dcad3 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 16 May 2024 08:01:13 +0200
Subject: [PATCH 2/2] Revert test changes

---
 clang/test/CodeGen/X86/avx512dq-builtins.c    |  25 ++--
 .../X86/avx512f-builtins-constrained.c        |   9 +-
 clang/test/CodeGen/X86/avx512f-builtins.c     |  44 +++---
 clang/test/CodeGen/X86/avx512fp16-builtins.c  |  12 +-
 clang/test/CodeGen/X86/avx512vl-builtins.c    | 132 +++++++++---------
 .../test/CodeGen/X86/avx512vlfp16-builtins.c  |  16 +-
 6 files changed, 120 insertions(+), 118 deletions(-)

diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c
index 38443cf620b5b..b61c3eb3d54ad 100644
--- a/clang/test/CodeGen/X86/avx512dq-builtins.c
+++ b/clang/test/CodeGen/X86/avx512dq-builtins.c
@@ -262,16 +262,16 @@ __m512d test_mm512_xor_pd (__m512d __A, __m512d __B) {
 __m512d test_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_mask_xor_pd
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: xor <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return (__m512d) _mm512_mask_xor_pd(__W, __U, __A, __B);
 }
 
 __m512d test_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_maskz_xor_pd
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: xor <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return (__m512d) _mm512_maskz_xor_pd(__U, __A, __B);
 }
@@ -284,16 +284,16 @@ __m512 test_mm512_xor_ps (__m512 __A, __m512 __B) {
 __m512 test_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_mask_xor_ps
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: xor <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return (__m512) _mm512_mask_xor_ps(__W, __U, __A, __B);
 }
 
 __m512 test_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_maskz_xor_ps
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: xor <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return (__m512) _mm512_maskz_xor_ps(__U, __A, __B);
 }
@@ -306,16 +306,16 @@ __m512d test_mm512_or_pd (__m512d __A, __m512d __B) {
 __m512d test_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_mask_or_pd
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: or <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return (__m512d) _mm512_mask_or_pd(__W, __U, __A, __B);
 }
 
 __m512d test_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_maskz_or_pd
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: or <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return (__m512d) _mm512_maskz_or_pd(__U, __A, __B);
 }
@@ -328,16 +328,16 @@ __m512 test_mm512_or_ps (__m512 __A, __m512 __B) {
 __m512 test_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_mask_or_ps
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: or <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return (__m512) _mm512_mask_or_ps(__W, __U, __A, __B);
 }
 
 __m512 test_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_maskz_or_ps
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: or <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return (__m512) _mm512_maskz_or_ps(__U, __A, __B);
 }
@@ -350,16 +350,16 @@ __m512d test_mm512_and_pd (__m512d __A, __m512d __B) {
 __m512d test_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_mask_and_pd
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return (__m512d) _mm512_mask_and_pd(__W, __U, __A, __B);
 }
 
 __m512d test_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_maskz_and_pd
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return (__m512d) _mm512_maskz_and_pd(__U, __A, __B);
 }
@@ -372,16 +372,16 @@ __m512 test_mm512_and_ps (__m512 __A, __m512 __B) {
 __m512 test_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_mask_and_ps
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: and <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return (__m512) _mm512_mask_and_ps(__W, __U, __A, __B);
 }
 
 __m512 test_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_maskz_and_ps
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: and <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return (__m512) _mm512_maskz_and_ps(__U, __A, __B);
 }
@@ -1520,3 +1520,4 @@ __mmask8 test_mm_mask_fpclass_ss_mask(__mmask8 __U, __m128 __A) {
   // CHECK: @llvm.x86.avx512.mask.fpclass.ss
   return _mm_mask_fpclass_ss_mask (__U, __A, 2);
 }
+
diff --git a/clang/test/CodeGen/X86/avx512f-builtins-constrained.c b/clang/test/CodeGen/X86/avx512f-builtins-constrained.c
index 8c553af6c7ea6..4044021a3f9e0 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins-constrained.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins-constrained.c
@@ -27,10 +27,10 @@ __m512d test_mm512_sqrt_pd(__m512d a)
 __m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   // COMMON-LABEL: test_mm512_mask_sqrt_pd
-  // COMMONIR: bitcast i8 %{{.*}} to <8 x i1>
   // UNCONSTRAINED: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
   // CONSTRAINED: call <8 x double> @llvm.experimental.constrained.sqrt.v8f64(<8 x double> %{{.*}}, metadata !{{.*}})
   // CHECK-ASM: vsqrtpd
+  // COMMONIR: bitcast i8 %{{.*}} to <8 x i1>
   // COMMONIR: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_sqrt_pd (__W,__U,__A);
 }
@@ -38,10 +38,10 @@ __m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
 __m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
 {
   // COMMON-LABEL: test_mm512_maskz_sqrt_pd
-  // COMMONIR: bitcast i8 %{{.*}} to <8 x i1>
   // UNCONSTRAINED: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
   // CONSTRAINED: call <8 x double> @llvm.experimental.constrained.sqrt.v8f64(<8 x double> %{{.*}}, metadata !{{.*}})
   // CHECK-ASM: vsqrtpd
+  // COMMONIR: bitcast i8 %{{.*}} to <8 x i1>
   // COMMONIR: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}}
   return _mm512_maskz_sqrt_pd (__U,__A);
 }
@@ -58,10 +58,10 @@ __m512 test_mm512_sqrt_ps(__m512 a)
 __m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
 {
   // COMMON-LABEL: test_mm512_mask_sqrt_ps
-  // COMMONIR: bitcast i16 %{{.*}} to <16 x i1>
   // UNCONSTRAINED: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
   // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.sqrt.v16f32(<16 x float> %{{.*}}, metadata !{{.*}})
   // CHECK-ASM: vsqrtps
+  // COMMONIR: bitcast i16 %{{.*}} to <16 x i1>
   // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_sqrt_ps( __W, __U, __A);
 }
@@ -69,10 +69,10 @@ __m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
 __m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
 {
   // COMMON-LABEL: test_mm512_maskz_sqrt_ps
-  // COMMONIR: bitcast i16 %{{.*}} to <16 x i1>
   // UNCONSTRAINED: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
   // CONSTRAINED: call <16 x float> @llvm.experimental.constrained.sqrt.v16f32(<16 x float> %{{.*}}, metadata !{{.*}})
   // CHECK-ASM: vsqrtps
+  // COMMONIR: bitcast i16 %{{.*}} to <16 x i1>
   // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}}
   return _mm512_maskz_sqrt_ps(__U ,__A);
 }
@@ -206,3 +206,4 @@ __m512 test_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
   // COMMONIR: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_cvtph_ps (__U,__A);
 }
+
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 1feb0fbea2022..0e3463849951e 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -13,8 +13,8 @@ __m512d test_mm512_sqrt_pd(__m512d a)
 __m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
   // CHECK-LABEL: @test_mm512_mask_sqrt_pd
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_sqrt_pd (__W,__U,__A);
 }
@@ -22,8 +22,8 @@ __m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
 __m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
 {
   // CHECK-LABEL: @test_mm512_maskz_sqrt_pd
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}}
   return _mm512_maskz_sqrt_pd (__U,__A);
 }
@@ -31,8 +31,8 @@ __m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
 __m512d test_mm512_mask_sqrt_round_pd(__m512d __W,__mmask8 __U,__m512d __A)
 {
   // CHECK-LABEL: @test_mm512_mask_sqrt_round_pd
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 11)
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_sqrt_round_pd(__W,__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
@@ -40,8 +40,8 @@ __m512d test_mm512_mask_sqrt_round_pd(__m512d __W,__mmask8 __U,__m512d __A)
 __m512d test_mm512_maskz_sqrt_round_pd(__mmask8 __U,__m512d __A)
 {
   // CHECK-LABEL: @test_mm512_maskz_sqrt_round_pd
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 11)
+  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}}
   return _mm512_maskz_sqrt_round_pd(__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
@@ -63,8 +63,8 @@ __m512 test_mm512_sqrt_ps(__m512 a)
 __m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
 {
   // CHECK-LABEL: @test_mm512_mask_sqrt_ps
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_sqrt_ps( __W, __U, __A);
 }
@@ -72,8 +72,8 @@ __m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
 __m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
 {
   // CHECK-LABEL: @test_mm512_maskz_sqrt_ps
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}}
   return _mm512_maskz_sqrt_ps(__U ,__A);
 }
@@ -81,8 +81,8 @@ __m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
 __m512 test_mm512_mask_sqrt_round_ps(__m512 __W,__mmask16 __U,__m512 __A)
 {
   // CHECK-LABEL: @test_mm512_mask_sqrt_round_ps
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 11)
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_sqrt_round_ps(__W,__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
@@ -90,8 +90,8 @@ __m512 test_mm512_mask_sqrt_round_ps(__m512 __W,__mmask16 __U,__m512 __A)
 __m512 test_mm512_maskz_sqrt_round_ps(__mmask16 __U,__m512 __A)
 {
   // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ps
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 11)
+  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}}
   return _mm512_maskz_sqrt_round_ps(__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
@@ -2709,96 +2709,96 @@ __mmask8 test_mm512_mask_cmp_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b)
 __m512i test_mm512_mask_and_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_and_epi32
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: and <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_and_epi32(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_and_epi32
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: and <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_and_epi32(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_and_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_and_epi64
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %[[AND_RES:.*]] = and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_and_epi64(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_and_epi64
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %[[AND_RES:.*]] = and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_and_epi64(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_or_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_or_epi32
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: or <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_or_epi32(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_or_epi32
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: or <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_or_epi32(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_or_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_or_epi64
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %[[OR_RES:.*]] = or <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_or_epi64(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_or_epi64
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %[[OR_RES:.*]] = or <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_or_epi64(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_xor_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_xor_epi32
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: xor <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_xor_epi32(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_xor_epi32
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: xor <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_xor_epi32(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_xor_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_xor_epi64
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %[[XOR_RES:.*]] = xor <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_xor_epi64(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_xor_epi64
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %[[XOR_RES:.*]] = xor <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_xor_epi64(__k,__a, __b);
 }
@@ -10769,8 +10769,8 @@ __m512d test_mm512_abs_pd(__m512d a){
 __m512d test_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A){
   // CHECK-LABEL: @test_mm512_mask_abs_pd
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: %[[AND_RES:.*]] = and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
   // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_abs_pd (__W,__U,__A);
 }
@@ -10783,8 +10783,8 @@ __m512 test_mm512_abs_ps(__m512 a){
 __m512 test_mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A){
   // CHECK-LABEL: @test_mm512_mask_abs_ps
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: and <16 x i32>
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_abs_ps( __W, __U, __A);
 }
diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c
index 27cd2c18dc373..a766476ca92bd 100644
--- a/clang/test/CodeGen/X86/avx512fp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c
@@ -701,7 +701,6 @@ __m512h test_mm512_conj_pch(__m512h __A) {
 __m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) {
   // CHECK-LABEL: @test_mm512_mask_conj_pch
   // CHECK: %{{.*}} = trunc i32 %{{.*}} to i16
-  // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
   // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
   // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
@@ -709,6 +708,7 @@ __m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) {
   // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
   // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
   // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
+  // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
   return _mm512_mask_conj_pch(__W, __U, __A);
@@ -717,13 +717,13 @@ __m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) {
 __m512h test_mm512_maskz_conj_pch(__mmask32 __U, __m512h __A) {
   // CHECK-LABEL: @test_mm512_maskz_conj_pch
   // CHECK: %{{.*}} = trunc i32 %{{.*}} to i16
-  // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
   // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
   // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
   // CHECK: %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
   // CHECK: %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
   // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
+  // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
   // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   // CHECK: %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
   return _mm512_maskz_conj_pch(__U, __A);
@@ -2052,16 +2052,16 @@ __m512h test_mm512_sqrt_round_ph(__m512h __A) {
 __m512h test_mm512_mask_sqrt_round_ph(__m512h __W, __mmask32 __U, __m512h __A) {
   // CHECK-LABEL: @test_mm512_mask_sqrt_round_ph
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11)
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_sqrt_round_ph(__W, __U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
 __m512h test_mm512_maskz_sqrt_round_ph(__mmask32 __U, __m512h __A) {
   // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ph
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11)
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}}
   return _mm512_maskz_sqrt_round_ph(__U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
@@ -2073,15 +2073,15 @@ __m512h test_mm512_sqrt_ph(__m512h __A) {
 }
 __m512h test_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
   // CHECK-LABEL: @test_mm512_mask_sqrt_ph
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}})
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_sqrt_ph(__W, __U, __A);
 }
 
 __m512h test_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
   // CHECK-LABEL: @test_mm512_maskz_sqrt_ph
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}})
+  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
   // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}}
   return _mm512_maskz_sqrt_ph(__U, __A);
 }
diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
index 484001df48b80..6f544c21e798d 100644
--- a/clang/test/CodeGen/X86/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -2836,210 +2836,210 @@ __mmask8 test_mm_mask_cmp_pd_mask_true_us(__mmask8 m, __m128d a, __m128d b) {
 __m128d test_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fmadd_pd(__A, __U, __B, __C);
 }
 
 __m128d test_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fmsub_pd(__A, __U, __B, __C);
 }
 
 __m128d test_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fmadd_pd(__A, __B, __C, __U);
 }
 
 __m128d test_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fnmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fnmadd_pd(__A, __B, __C, __U);
 }
 
 __m128d test_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_maskz_fmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_fmadd_pd(__U, __A, __B, __C);
 }
 
 __m128d test_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_maskz_fmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_fmsub_pd(__U, __A, __B, __C);
 }
 
 __m128d test_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_maskz_fnmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_fnmadd_pd(__U, __A, __B, __C);
 }
 
 __m128d test_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_maskz_fnmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_fnmsub_pd(__U, __A, __B, __C);
 }
 
 __m256d test_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fmadd_pd(__A, __U, __B, __C);
 }
 
 __m256d test_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fmsub_pd(__A, __U, __B, __C);
 }
 
 __m256d test_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fmadd_pd(__A, __B, __C, __U);
 }
 
 __m256d test_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fnmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fnmadd_pd(__A, __B, __C, __U);
 }
 
 __m256d test_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fmadd_pd(__U, __A, __B, __C);
 }
 
 __m256d test_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fmsub_pd(__U, __A, __B, __C);
 }
 
 __m256d test_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fnmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fnmadd_pd(__U, __A, __B, __C);
 }
 
 __m256d test_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fnmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fnmsub_pd(__U, __A, __B, __C);
 }
 
 __m128 test_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fmadd_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmadd_ps(__A, __U, __B, __C);
 }
 
 __m128 test_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fmsub_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmsub_ps(__A, __U, __B, __C);
 }
 
 __m128 test_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmadd_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fmadd_ps(__A, __B, __C, __U);
 }
 
 __m128 test_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fnmadd_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fnmadd_ps(__A, __B, __C, __U);
 }
 
 __m128 test_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fmadd_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fmadd_ps(__U, __A, __B, __C);
 }
 
 __m128 test_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fmsub_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fmsub_ps(__U, __A, __B, __C);
 }
 
 __m128 test_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fnmadd_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fnmadd_ps(__U, __A, __B, __C);
 }
 
 __m128 test_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fnmsub_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fnmsub_ps(__U, __A, __B, __C);
 }
@@ -3108,90 +3108,90 @@ __m256 test_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 _
 __m128d test_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fmaddsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK-NOT: fneg
   // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fmaddsub_pd(__A, __U, __B, __C);
 }
 
 __m128d test_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fmsubadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]])
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fmsubadd_pd(__A, __U, __B, __C);
 }
 
 __m128d test_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmaddsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK-NOT: fneg
   // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fmaddsub_pd(__A, __B, __C, __U);
 }
 
 __m128d test_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_maskz_fmaddsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK-NOT: fneg
   // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_fmaddsub_pd(__U, __A, __B, __C);
 }
 
 __m128d test_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_maskz_fmsubadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]])
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_maskz_fmsubadd_pd(__U, __A, __B, __C);
 }
 
 __m256d test_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fmaddsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK-NOT: fneg
   // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fmaddsub_pd(__A, __U, __B, __C);
 }
 
 __m256d test_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fmsubadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: [[NEG:%.+]] = fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]])
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fmsubadd_pd(__A, __U, __B, __C);
 }
 
 __m256d test_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmaddsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK-NOT: fneg
   // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fmaddsub_pd(__A, __B, __C, __U);
 }
 
 __m256d test_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmaddsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK-NOT: fneg
   // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fmaddsub_pd(__U, __A, __B, __C);
 }
 
 __m256d test_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_maskz_fmsubadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: [[NEG:%.+]] = fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]])
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_fmsubadd_pd(__U, __A, __B, __C);
 }
@@ -3199,44 +3199,44 @@ __m256d test_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m
 __m128 test_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fmaddsub_ps
   // CHECK-NOT: fneg
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmaddsub_ps(__A, __U, __B, __C);
 }
 
 __m128 test_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fmsubadd_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]])
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmsubadd_ps(__A, __U, __B, __C);
 }
 
 __m128 test_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmaddsub_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK-NOT: fneg
   // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fmaddsub_ps(__A, __B, __C, __U);
 }
 
 __m128 test_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fmaddsub_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK-NOT: fneg
   // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fmaddsub_ps(__U, __A, __B, __C);
 }
 
 __m128 test_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_maskz_fmsubadd_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]])
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_fmsubadd_ps(__U, __A, __B, __C);
 }
@@ -3283,27 +3283,27 @@ __m256 test_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256
 __m128d test_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fmsub_pd(__A, __B, __C, __U);
 }
 
 __m256d test_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fmsub_pd(__A, __B, __C, __U);
 }
 
 __m128 test_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmsub_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fmsub_ps(__A, __B, __C, __U);
 }
@@ -3318,27 +3318,27 @@ __m256 test_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __
 __m128d test_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmsubadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
   // CHECK: call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]])
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fmsubadd_pd(__A, __B, __C, __U);
 }
 
 __m256d test_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fmsubadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: [[NEG:%.+]] = fneg <4 x double> %{{.+}}
   // CHECK: call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> [[NEG]])
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fmsubadd_pd(__A, __B, __C, __U);
 }
 
 __m128 test_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fmsubadd_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
   // CHECK: call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]])
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fmsubadd_ps(__A, __B, __C, __U);
 }
@@ -3353,27 +3353,27 @@ __m256 test_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8
 __m128d test_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fnmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fnmadd_pd(__A, __U, __B, __C);
 }
 
 __m256d test_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fnmadd_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fnmadd_pd(__A, __U, __B, __C);
 }
 
 __m128 test_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fnmadd_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fnmadd_ps(__A, __U, __B, __C);
 }
@@ -3388,60 +3388,60 @@ __m256 test_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __
 __m128d test_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fnmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_fnmsub_pd(__A, __U, __B, __C);
 }
 
 __m128d test_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fnmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: fneg <2 x double> %{{.*}}
   // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask3_fnmsub_pd(__A, __B, __C, __U);
 }
 
 __m256d test_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
   // CHECK-LABEL: @test_mm256_mask_fnmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_fnmsub_pd(__A, __U, __B, __C);
 }
 
 __m256d test_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm256_mask3_fnmsub_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: fneg <4 x double> %{{.*}}
   // CHECK: call <4 x double> @llvm.fma.v4f64(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask3_fnmsub_pd(__A, __B, __C, __U);
 }
 
 __m128 test_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
   // CHECK-LABEL: @test_mm_mask_fnmsub_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fnmsub_ps(__A, __U, __B, __C);
 }
 
 __m128 test_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
   // CHECK-LABEL: @test_mm_mask3_fnmsub_ps
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: fneg <4 x float> %{{.*}}
   // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask3_fnmsub_ps(__A, __B, __C, __U);
 }
@@ -6668,22 +6668,22 @@ __m256d test_mm256_maskz_movedup_pd(__mmask8 __U, __m256d __A) {
 __m128i test_mm_mask_set1_epi32(__m128i __O, __mmask8 __M) {
   // CHECK-LABEL: @test_mm_mask_set1_epi32
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
   // CHECK: insertelement <4 x i32> %{{.*}}32 1
   // CHECK: insertelement <4 x i32> %{{.*}}32 2
   // CHECK: insertelement <4 x i32> %{{.*}}32 3
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_set1_epi32(__O, __M, 5);
 }
 
 __m128i test_mm_maskz_set1_epi32(__mmask8 __M) {
   // CHECK-LABEL: @test_mm_maskz_set1_epi32
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
   // CHECK: insertelement <4 x i32> %{{.*}}32 1
   // CHECK: insertelement <4 x i32> %{{.*}}32 2
   // CHECK: insertelement <4 x i32> %{{.*}}32 3
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_set1_epi32(__M, 5);
 }
@@ -6718,40 +6718,40 @@ __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) {
 __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm_mask_set1_epi64
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
   // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_set1_epi64(__O, __M, __A);
 }
 
 __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm_maskz_set1_epi64
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <2 x i32>
   // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
   // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <2 x i32>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_set1_epi64(__M, __A);
 }
 
 __m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm256_mask_set1_epi64
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_set1_epi64(__O, __M, __A);
 }
 
 __m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
   // CHECK-LABEL: @test_mm256_maskz_set1_epi64
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
   // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_set1_epi64(__M, __A);
 }
@@ -7108,14 +7108,14 @@ __m128d test_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) {
 __m256d test_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_mask_unpackhi_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}} <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_unpackhi_pd(__W, __U, __A, __B);
 }
 
 __m256d test_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpackhi_pd
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}} <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_unpackhi_pd(__U, __A, __B);
 }
@@ -7931,16 +7931,16 @@ __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
 __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3);
 }
 
 __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3);
 }
@@ -7973,16 +7973,16 @@ __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
 __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3);
 }
 
 __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2
-  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32>
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32>
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_shuffle_i64x2(__U, __A, __B, 3);
 }
diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
index 7d41015a3a729..3a212ed683437 100644
--- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c
@@ -419,7 +419,6 @@ __m256h test_mm256_conj_pch(__m256h __A) {
 __m256h test_mm256_mask_conj_pch(__m256h __W, __mmask32 __U, __m256h __A) {
   // CHECK-LABEL: @test_mm256_mask_conj_pch
   // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8
-  // CHECK: %{{.*}} =
bitcast i8 %{{.*}} to <8 x i1> // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float> // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> @@ -427,6 +426,7 @@ __m256h test_mm256_mask_conj_pch(__m256h __W, __mmask32 __U, __m256h __A) { // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float> // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half> // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float> + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half> return _mm256_mask_conj_pch(__W, __U, __A); @@ -435,13 +435,13 @@ __m256h test_mm256_mask_conj_pch(__m256h __W, __mmask32 __U, __m256h __A) { __m256h test_mm256_maskz_conj_pch(__mmask32 __U, __m256h __A) { // CHECK-LABEL: @test_mm256_maskz_conj_pch // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8 - // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <8 x float> // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <8 x i32> // CHECK: %{{.*}} = xor <8 x i32> %{{.*}}, %{{.*}} // CHECK: %{{.*}} = bitcast <8 x i32> %{{.*}} to <8 x float> // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half> + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} // CHECK: %{{.*}} = bitcast <8 x float> %{{.*}} to <16 x half> return _mm256_maskz_conj_pch(__U, __A); @@ -461,8 +461,6 @@ __m128h test_mm_conj_pch(__m128h __A) { __m128h test_mm_mask_conj_pch(__m128h __W, __mmask32 __U, __m128h __A) { // CHECK-LABEL: @test_mm_mask_conj_pch // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8 - // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> - // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float> // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> @@ -470,6 +468,8 @@ __m128h test_mm_mask_conj_pch(__m128h __W, __mmask32 __U, __m128h __A) { // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float> // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half> // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float> + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half> return _mm_mask_conj_pch(__W, __U, __A); @@ -478,14 +478,14 @@ __m128h test_mm_mask_conj_pch(__m128h __W, __mmask32 __U, __m128h __A) { __m128h test_mm_maskz_conj_pch(__mmask32 __U, __m128h __A) { // CHECK-LABEL: @test_mm_maskz_conj_pch // CHECK: %{{.*}} = trunc i32 %{{.*}} to i8 - // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> - // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float> // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <4 x i32> // CHECK: %{{.*}} = xor <4 x i32> %{{.*}}, %{{.*}} // CHECK: %{{.*}} = bitcast <4 x i32> %{{.*}} to <4 x float> // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half> + // CHECK: %{{.*}} = bitcast i8 
%{{.*}} to <8 x i1> + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} // CHECK: %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half> return _mm_maskz_conj_pch(__U, __A); @@ -2844,8 +2844,8 @@ __m128h test_mm_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C) { __m128h test_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { // CHECK-LABEL: @test_mm_mask_fcmadd_pch - // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.128 + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_fcmadd_pch(__A, __U, __B, __C); } @@ -2933,8 +2933,8 @@ __m128h test_mm_fmadd_pch(__m128h __A, __m128h __B, __m128h __C) { __m128h test_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { // CHECK-LABEL: @test_mm_mask_fmadd_pch - // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> poison, <4 x i32> // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.128 + // CHECK: %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> // CHECK: %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_fmadd_pch(__A, __U, __B, __C); }