From 0f0effd2cfd05ebb23c00550c487702672140c87 Mon Sep 17 00:00:00 2001 From: ahmed Date: Tue, 7 Oct 2025 13:03:31 +0300 Subject: [PATCH 01/12] feat: [Headers][X86] Allow AVX512 masked shuffles to be used in constexpr --- clang/lib/Headers/avx512dqintrin.h | 24 ++-- clang/lib/Headers/avx512fintrin.h | 48 ++++---- clang/lib/Headers/avx512vldqintrin.h | 20 ++-- clang/lib/Headers/avx512vlintrin.h | 132 ++++++++++----------- clang/test/CodeGen/X86/avx512dq-builtins.c | 17 +++ clang/test/CodeGen/X86/avx512f-builtins.c | 76 ++++++++++++ clang/test/CodeGen/X86/avx512vl-builtins.c | 80 +++++++++++++ 7 files changed, 285 insertions(+), 112 deletions(-) diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index fb65bf933b8ad..a5c1b78ad4305 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -1083,7 +1083,7 @@ _mm512_broadcast_f32x2(__m128 __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -1091,7 +1091,7 @@ _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) (__v16sf)__O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -1106,7 +1106,7 @@ _mm512_broadcast_f32x8(__m256 __A) { 0, 1, 2, 3, 4, 5, 6, 7); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -1114,7 +1114,7 @@ _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) (__v16sf)__O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -1128,7 +1128,7 @@ _mm512_broadcast_f64x2(__m128d __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, @@ -1136,7 +1136,7 @@ _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) (__v8df)__O); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, @@ -1151,7 +1151,7 @@ _mm512_broadcast_i32x2(__m128i __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1159,7 +1159,7 @@ _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) (__v16si)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1174,7 +1174,7 @@ _mm512_broadcast_i32x8(__m256i __A) { 0, 1, 2, 3, 4, 5, 6, 7); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1182,7 +1182,7 @@ _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) (__v16si)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1196,7 +1196,7 @@ _mm512_broadcast_i64x2(__m128i __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, @@ -1204,7 +1204,7 @@ _mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) (__v8di)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 80e58425cdd71..14d79bdba3844 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -225,7 +225,7 @@ _mm512_broadcastd_epi32(__m128i __A) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512(__M, @@ -233,7 +233,7 @@ _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) (__v16si) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512(__M, @@ -247,7 +247,7 @@ _mm512_broadcastq_epi64(__m128i __A) { 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512(__M, @@ -256,7 +256,7 @@ _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512(__M, @@ -321,7 +321,7 @@ _mm512_set1_epi32(int __s) __s, __s, __s, __s, __s, __s, __s, __s }; } -static __inline __m512i __DEFAULT_FN_ATTRS512 +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_set1_epi32(__mmask16 __M, int __A) { return (__m512i)__builtin_ia32_selectd_512(__M, @@ -335,7 +335,7 @@ _mm512_set1_epi64(long long __d) return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; } -static __inline __m512i __DEFAULT_FN_ATTRS512 +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) { return (__m512i)__builtin_ia32_selectq_512(__M, @@ -6552,7 +6552,7 @@ _mm512_broadcast_f32x4(__m128 __A) { 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -6560,7 +6560,7 @@ _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) (__v16sf)__O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -6597,7 +6597,7 @@ _mm512_broadcast_i32x4(__m128i __A) { 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -6605,7 +6605,7 @@ _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) (__v16si)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -6635,7 +6635,7 @@ _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512(__M, @@ -6643,7 +6643,7 @@ _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) (__v8df) __O); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512(__M, @@ -6651,7 +6651,7 @@ _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) (__v8df) _mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512(__M, @@ -6659,7 +6659,7 @@ _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) (__v16sf) __O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512(__M, @@ -8381,7 +8381,7 @@ _mm512_movehdup_ps (__m512 __A) 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -8389,7 +8389,7 @@ _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -8404,7 +8404,7 @@ _mm512_moveldup_ps (__m512 __A) 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -8412,7 +8412,7 @@ _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -8420,26 +8420,26 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), _mm_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) { return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), @@ -8884,7 +8884,7 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B) } #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS512 + static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) { return (__m512i) __builtin_ia32_selectd_512(__M, @@ -8892,7 +8892,7 @@ _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) (__v16si) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 + static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { return (__m512i) __builtin_ia32_selectq_512(__M, diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index 68bd52e43981a..bc0ab63c20351 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -968,7 +968,7 @@ _mm256_broadcast_f32x2(__m128 __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, @@ -976,7 +976,7 @@ _mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) (__v8sf)__O); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, @@ -990,7 +990,7 @@ _mm256_broadcast_f64x2(__m128d __A) { 0, 1, 0, 1); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, @@ -998,7 +998,7 @@ _mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) (__v4df)__O); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, @@ -1012,7 +1012,7 @@ _mm_broadcast_i32x2(__m128i __A) { 0, 1, 0, 1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, @@ -1020,7 +1020,7 @@ _mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) (__v4si)__O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, @@ -1034,7 +1034,7 @@ _mm256_broadcast_i32x2(__m128i __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -1042,7 +1042,7 @@ _mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) (__v8si)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -1056,7 +1056,7 @@ _mm256_broadcast_i64x2(__m128i __A) { 0, 1, 0, 1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, @@ -1064,7 +1064,7 @@ _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) (__v4di)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 965741f0ff944..1cef356eddb0a 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -5101,7 +5101,7 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) { return (__m128i)__builtin_ia32_selectd_128(__M, @@ -5109,7 +5109,7 @@ _mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) (__v4si)__O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_set1_epi32( __mmask8 __M, int __A) { return (__m128i)__builtin_ia32_selectd_128(__M, @@ -5117,7 +5117,7 @@ _mm_maskz_set1_epi32( __mmask8 __M, int __A) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) { return (__m256i)__builtin_ia32_selectd_256(__M, @@ -5125,7 +5125,7 @@ _mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) (__v8si)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_set1_epi32( __mmask8 __M, int __A) { return (__m256i)__builtin_ia32_selectd_256(__M, @@ -5134,7 +5134,7 @@ _mm256_maskz_set1_epi32( __mmask8 __M, int __A) } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, @@ -5142,7 +5142,7 @@ _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) (__v2di) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, @@ -5150,7 +5150,7 @@ _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) (__v2di) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) { return (__m256i) __builtin_ia32_selectq_256(__M, @@ -5158,7 +5158,7 @@ _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) (__v4di) __O) ; } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) { return (__m256i) __builtin_ia32_selectq_256(__M, @@ -5611,7 +5611,7 @@ _mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5619,7 +5619,7 @@ _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5627,7 +5627,7 @@ _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5635,7 +5635,7 @@ _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5643,7 +5643,7 @@ _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -5651,7 +5651,7 @@ _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -5659,7 +5659,7 @@ _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -5667,7 +5667,7 @@ _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -5675,7 +5675,7 @@ _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5683,7 +5683,7 @@ _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5691,7 +5691,7 @@ _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5699,7 +5699,7 @@ _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5707,7 +5707,7 @@ _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -5715,7 +5715,7 @@ _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -5723,7 +5723,7 @@ _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -5731,7 +5731,7 @@ _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -6055,7 +6055,7 @@ _mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6063,7 +6063,7 @@ _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6071,7 +6071,7 @@ _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6079,7 +6079,7 @@ _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6087,7 +6087,7 @@ _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -6095,7 +6095,7 @@ _mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -6103,7 +6103,7 @@ _mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -6111,7 +6111,7 @@ _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -6119,7 +6119,7 @@ _mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6127,7 +6127,7 @@ _mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6135,7 +6135,7 @@ _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6143,7 +6143,7 @@ _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6151,7 +6151,7 @@ _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -6159,7 +6159,7 @@ _mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -6167,7 +6167,7 @@ _mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -6175,7 +6175,7 @@ _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -6594,7 +6594,7 @@ _mm256_broadcast_f32x4(__m128 __A) { 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, @@ -6602,7 +6602,7 @@ _mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) (__v8sf)__O); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, @@ -6616,7 +6616,7 @@ _mm256_broadcast_i32x4(__m128i __A) { 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -6624,7 +6624,7 @@ _mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) (__v8si)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -6632,7 +6632,7 @@ _mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256(__M, @@ -6640,7 +6640,7 @@ _mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) (__v4df) __O); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256(__M, @@ -6648,7 +6648,7 @@ _mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) (__v4df) _mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_selectps_128(__M, @@ -6656,7 +6656,7 @@ _mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) (__v4sf) __O); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_selectps_128(__M, @@ -6664,7 +6664,7 @@ _mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) (__v4sf) _mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256(__M, @@ -6672,7 +6672,7 @@ _mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) (__v8sf) __O); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256(__M, @@ -6680,7 +6680,7 @@ _mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) (__v8sf) _mm256_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128(__M, @@ -6688,7 +6688,7 @@ _mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) (__v4si) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128(__M, @@ -6696,7 +6696,7 @@ _mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) (__v4si) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256(__M, @@ -6704,7 +6704,7 @@ _mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) (__v8si) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256(__M, @@ -6712,7 +6712,7 @@ _mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) (__v8si) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128(__M, @@ -6720,7 +6720,7 @@ _mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) (__v2di) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128(__M, @@ -6728,7 +6728,7 @@ _mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) (__v2di) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256(__M, @@ -6736,7 +6736,7 @@ _mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) (__v4di) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256(__M, @@ -8003,7 +8003,7 @@ _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ (__v4di)_mm256_setzero_si256())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -8011,7 +8011,7 @@ _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -8019,7 +8019,7 @@ _mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -8027,7 +8027,7 @@ _mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -8035,7 +8035,7 @@ _mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -8043,7 +8043,7 @@ _mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -8051,7 +8051,7 @@ _mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -8059,7 +8059,7 @@ _mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index 4112561216af8..0676b5b091588 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -12,6 +12,23 @@ #include #include "builtin_test_helpers.h" +// constexpr coverage for i32x8/f32x8 broadcasts (DQ) +{ + __m256i a = _mm256_set1_epi32(8); + TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x8(_mm512_setzero_si512(), 0xFFFF, a), + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8)); +} +{ + __m256 a = _mm256_set1_ps(5.0f); + TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x8(_mm512_setzero_ps(), 0xFFFF, a), + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5)); +} +{ + __m256 a = _mm256_set1_ps(7.0f); + TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x8(0xFFFF, a), + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7)); +} + __mmask8 test_knot_mask8(__mmask8 a) { // CHECK-LABEL: test_knot_mask8 // CHECK: [[IN:%.*]] = bitcast i8 %{{.*}} to <8 x i1> diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 7756f0da18c03..0caa23df95982 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -11,6 +11,82 @@ #include #include "builtin_test_helpers.h" +// constexpr coverage for masked set1/broadcast/unpack/move_ss/move_sd (F) +TEST_CONSTEXPR(match_v16si(_mm512_mask_set1_epi32(_mm512_setzero_si512(), 0xFFFF, 13), + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13)); +TEST_CONSTEXPR(match_v8di(_mm512_mask_set1_epi64(_mm512_setzero_si512(), 0xFF, 21), + 21,21,21,21,21,21,21,21)); + +{ + __m128 a = (__m128)(__v4sf){1,2,3,4}; + TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x4(_mm512_setzero_ps(), 0xFFFF, a), + 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); +} +{ + __m128 a = (__m128)(__v4sf){1,2,3,4}; + TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x4(0xFFFF, a), + 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); +} +{ + __m128d a = (__m128d)(__v2df){1,2}; + TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xFF, a), + 1,2,1,2,1,2,1,2)); +} +{ + __m128d a = (__m128d)(__v2df){1,2}; + TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xFF, a), + 1,2,1,2,1,2,1,2)); +} + +// additionally add i32x4/i32x8 z-forms +{ + __m128i a = (__m128i)(__v4si){0,1,2,3}; + TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x4(0xFFFF, a), + 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3)); +} +{ + __m256i a = _mm256_set1_epi32(9); + TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xFFFF, a), + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9)); +} + +// unpack and moves (512) +{ + __m512 a = _mm512_set_ps(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); + __m512 b = _mm512_set1_ps(0); + TEST_CONSTEXPR(match_m512(_mm512_mask_unpackhi_ps(_mm512_setzero_ps(), 0xFFFF, a, b), + 15,0,13,0,11,0,9,0,7,0,5,0,3,0,1,0)); +} +{ + __m512 a = _mm512_set1_ps(1); + __m512 b = _mm512_set1_ps(2); + TEST_CONSTEXPR(match_m512(_mm512_mask_unpacklo_ps(_mm512_setzero_ps(), 0xFFFF, a, b), + 1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2)); +} + +// move_ss/move_sd in constexpr masked form +{ + __m128 a = (__m128)(__v4sf){10,2,3,4}; + __m128 b = (__m128)(__v4sf){20,6,7,8}; + TEST_CONSTEXPR(match_m128(_mm_mask_move_ss(_mm_setzero_ps(), 0x1, a, b), 20,0,0,0)); +} +{ + __m128d a = (__m128d)(__v2df){10,2}; + __m128d b = (__m128d)(__v2df){20,6}; + TEST_CONSTEXPR(match_m128d(_mm_mask_move_sd(_mm_setzero_pd(), 0x1, a, b), 20,0)); +} +// z-forms for movehdup/moveldup +{ + __m512 a = _mm512_set_ps(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); + TEST_CONSTEXPR(match_m512(_mm512_maskz_movehdup_ps(0xFFFF, a), + 15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1)); +} +{ + __m512 a = _mm512_set_ps(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); + TEST_CONSTEXPR(match_m512(_mm512_maskz_moveldup_ps(0xFFFF, a), + 16,16,14,14,12,12,10,10,8,8,6,6,4,4,2,2)); +} + __m512d test_mm512_sqrt_pd(__m512d a) { // CHECK-LABEL: test_mm512_sqrt_pd diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 51385d57d2944..1d1d3520d7659 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -7,6 +7,86 @@ #include #include "builtin_test_helpers.h" +// constexpr coverage for masked set1/broadcast/unpack/movehdup/moveldup (VL) +TEST_CONSTEXPR(match_v4si(_mm_mask_set1_epi32(_mm_setzero_si128(), 0xF, 7), 7, 7, 7, 7)); +TEST_CONSTEXPR(match_v2di(_mm_mask_set1_epi64(_mm_setzero_si128(), 0x3, 9), 9, 9)); + +TEST_CONSTEXPR(match_v8si(_mm256_mask_set1_epi32(_mm256_setzero_si256(), 0xFF, 5), 5, 5, 5, 5, 5, 5, 5, 5)); +TEST_CONSTEXPR(match_v4di(_mm256_mask_set1_epi64(_mm256_setzero_si256(), 0xF, 11), 11, 11, 11, 11)); + +{ + __m128i a = (__m128i)(__v4si){0,1,2,3}; + TEST_CONSTEXPR(match_v4si(_mm_mask_broadcast_i32x2(_mm_setzero_si128(), 0xF, a), 0,1,0,1)); +} +{ + __m128 a = (__m128)(__v4sf){1.f,2.f,3.f,4.f}; + TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xFF, a), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); +} +{ + __m128 a = (__m128)(__v4sf){1.f,2.f,3.f,4.f}; + TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xFF, a), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); +} +{ + __m128d a = (__m128d)(__v2df){1.0,2.0}; + TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xFF, a), 1.0,2.0,1.0,2.0)); +} +{ + __m128d a = (__m128d)(__v2df){1.0,2.0}; + TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xFF, a), 1.0,2.0,1.0,2.0)); +} + +// i64x2 maskz +{ + __m128i a = (__m128i)(__v2di){1,2}; + TEST_CONSTEXPR(match_v4di(_mm256_maskz_broadcast_i64x2(0xF, a), 1,2,1,2)); +} + +// i32x4/f32x4 maskz (VL) +{ + __m128i a = (__m128i)(__v4si){0,1,2,3}; + TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x4(0xFF, a), 0,1,2,3,0,1,2,3)); +} +{ + __m128 a = (__m128)(__v4sf){0,1,2,3}; + TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x4(0xFF, a), 0,1,2,3,0,1,2,3)); +} + +// i32x2 maskz (128/256) +{ + __m128i a = (__m128i)(__v4si){0,1,2,3}; + TEST_CONSTEXPR(match_v4si(_mm_maskz_broadcast_i32x2(0xF, a), 0,1,0,1)); +} +{ + __m128i a = (__m128i)(__v4si){0,1,2,3}; + TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x2(0xFF, a), 0,1,0,1,0,1,0,1)); +} + +// unpackhi/lo with full mask behaves like the underlying unpack +{ + __m128d a = (__m128d)(__v2df){1.0,2.0}; + __m128d b = (__m128d)(__v2df){3.0,4.0}; + TEST_CONSTEXPR(match_m128d(_mm_mask_unpackhi_pd(_mm_setzero_pd(), 0x3, a, b), 2.0,4.0)); + TEST_CONSTEXPR(match_m128d(_mm_mask_unpacklo_pd(_mm_setzero_pd(), 0x3, a, b), 1.0,3.0)); +} +{ + __m256d a = (__m256d)(__v4df){1.0,2.0,3.0,4.0}; + __m256d b = (__m256d)(__v4df){5.0,6.0,7.0,8.0}; + TEST_CONSTEXPR(match_m256d(_mm256_mask_unpackhi_pd(_mm256_setzero_pd(), 0xFF, a, b), 2.0,6.0,4.0,8.0)); + TEST_CONSTEXPR(match_m256d(_mm256_mask_unpacklo_pd(_mm256_setzero_pd(), 0xFF, a, b), 1.0,5.0,3.0,7.0)); +} + +// movehdup/moveldup with full mask equals the underlying *_move*dup +{ + __m128 a = (__m128)(__v4sf){1.f,2.f,3.f,4.f}; + TEST_CONSTEXPR(match_m128(_mm_mask_movehdup_ps(_mm_setzero_ps(), 0xF, a), 2.f,2.f,4.f,4.f)); + TEST_CONSTEXPR(match_m128(_mm_mask_moveldup_ps(_mm_setzero_ps(), 0xF, a), 1.f,1.f,3.f,3.f)); +} +{ + __m256 a = (__m256)(__v8sf){1,2,3,4,5,6,7,8}; + TEST_CONSTEXPR(match_m256(_mm256_mask_movehdup_ps(_mm256_setzero_ps(), 0xFF, a), 2,2,4,4,6,6,8,8)); + TEST_CONSTEXPR(match_m256(_mm256_mask_moveldup_ps(_mm256_setzero_ps(), 0xFF, a), 1,1,3,3,5,5,7,7)); +} + __mmask8 test_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) { // CHECK-LABEL: test_mm_cmpeq_epu32_mask // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}} From facdc7e0663aaeebaf107b16108e6d82857c1552 Mon Sep 17 00:00:00 2001 From: ahmed Date: Tue, 7 Oct 2025 13:06:55 +0300 Subject: [PATCH 02/12] chor: apply formatting --- clang/lib/Headers/avx512dqintrin.h | 36 ++--- clang/lib/Headers/avx512fintrin.h | 77 +++------ clang/lib/Headers/avx512vldqintrin.h | 30 ++-- clang/lib/Headers/avx512vlintrin.h | 229 ++++++++++----------------- 4 files changed, 128 insertions(+), 244 deletions(-) diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index a5c1b78ad4305..3681ccab0b179 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -1084,16 +1084,14 @@ _mm512_broadcast_f32x2(__m128 __A) { } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) -{ +_mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x2(__A), (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) -{ +_mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x2(__A), (__v16sf)_mm512_setzero_ps()); @@ -1107,16 +1105,14 @@ _mm512_broadcast_f32x8(__m256 __A) { } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) -{ +_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x8(__A), (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) -{ +_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x8(__A), (__v16sf)_mm512_setzero_ps()); @@ -1129,16 +1125,14 @@ _mm512_broadcast_f64x2(__m128d __A) { } static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) -{ +_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, (__v8df)_mm512_broadcast_f64x2(__A), (__v8df)__O); } static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) -{ +_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, (__v8df)_mm512_broadcast_f64x2(__A), (__v8df)_mm512_setzero_pd()); @@ -1152,16 +1146,14 @@ _mm512_broadcast_i32x2(__m128i __A) { } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) -{ +_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x2(__A), (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) -{ +_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x2(__A), (__v16si)_mm512_setzero_si512()); @@ -1175,16 +1167,14 @@ _mm512_broadcast_i32x8(__m256i __A) { } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) -{ +_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x8(__A), (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) -{ +_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x8(__A), (__v16si)_mm512_setzero_si512()); @@ -1197,16 +1187,14 @@ _mm512_broadcast_i64x2(__m128i __A) { } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) -{ +_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_broadcast_i64x2(__A), (__v8di)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) -{ +_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_broadcast_i64x2(__A), (__v8di)_mm512_setzero_si512()); diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 14d79bdba3844..8401647ef9da5 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -226,16 +226,14 @@ _mm512_broadcastd_epi32(__m128i __A) { } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) -{ +_mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512(__M, (__v16si) _mm512_broadcastd_epi32(__A), (__v16si) __O); } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) -{ +_mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512(__M, (__v16si) _mm512_broadcastd_epi32(__A), (__v16si) _mm512_setzero_si512()); @@ -248,17 +246,14 @@ _mm512_broadcastq_epi64(__m128i __A) { } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) -{ +_mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512(__M, (__v8di) _mm512_broadcastq_epi64(__A), (__v8di) __O); - } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) -{ +_mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512(__M, (__v8di) _mm512_broadcastq_epi64(__A), (__v8di) _mm512_setzero_si512()); @@ -322,8 +317,7 @@ _mm512_set1_epi32(int __s) } static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_set1_epi32(__mmask16 __M, int __A) -{ +_mm512_maskz_set1_epi32(__mmask16 __M, int __A) { return (__m512i)__builtin_ia32_selectd_512(__M, (__v16si)_mm512_set1_epi32(__A), (__v16si)_mm512_setzero_si512()); @@ -336,8 +330,7 @@ _mm512_set1_epi64(long long __d) } static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) -{ +_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) { return (__m512i)__builtin_ia32_selectq_512(__M, (__v8di)_mm512_set1_epi64(__A), (__v8di)_mm512_setzero_si512()); @@ -6553,16 +6546,14 @@ _mm512_broadcast_f32x4(__m128 __A) { } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) -{ +_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x4(__A), (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) -{ +_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x4(__A), (__v16sf)_mm512_setzero_ps()); @@ -6598,16 +6589,14 @@ _mm512_broadcast_i32x4(__m128i __A) { } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) -{ +_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x4(__A), (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) -{ +_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x4(__A), (__v16si)_mm512_setzero_si512()); @@ -6636,32 +6625,28 @@ _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) } static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) -{ +_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512(__M, (__v8df) _mm512_broadcastsd_pd(__A), (__v8df) __O); } static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) -{ +_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512(__M, (__v8df) _mm512_broadcastsd_pd(__A), (__v8df) _mm512_setzero_pd()); } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) -{ +_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512(__M, (__v16sf) _mm512_broadcastss_ps(__A), (__v16sf) __O); } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) -{ +_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512(__M, (__v16sf) _mm512_broadcastss_ps(__A), (__v16sf) _mm512_setzero_ps()); @@ -8382,16 +8367,14 @@ _mm512_movehdup_ps (__m512 __A) } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ +_mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_movehdup_ps(__A), (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) -{ +_mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_movehdup_ps(__A), (__v16sf)_mm512_setzero_ps()); @@ -8405,43 +8388,37 @@ _mm512_moveldup_ps (__m512 __A) } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ +_mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_moveldup_ps(__A), (__v16sf)__W); } static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) -{ +_mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_moveldup_ps(__A), (__v16sf)_mm512_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ +_mm_mask_move_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ +_mm_maskz_move_ss(__mmask8 __U, __m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), _mm_setzero_ps()); } static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ +_mm_mask_move_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W); } static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ +_mm_maskz_move_sd(__mmask8 __U, __m128d __A, __m128d __B) { return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), _mm_setzero_pd()); } @@ -8884,17 +8861,15 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B) } #endif - static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_set1_epi32(__m512i __O, __mmask16 __M, int __A) { return (__m512i) __builtin_ia32_selectd_512(__M, (__v16si) _mm512_set1_epi32(__A), (__v16si) __O); } - static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR -_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_set1_epi64(__m512i __O, __mmask8 __M, long long __A) { return (__m512i) __builtin_ia32_selectq_512(__M, (__v8di) _mm512_set1_epi64(__A), (__v8di) __O); diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index bc0ab63c20351..ee7974e924afe 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -969,16 +969,14 @@ _mm256_broadcast_f32x2(__m128 __A) { } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) -{ +_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, (__v8sf)_mm256_broadcast_f32x2(__A), (__v8sf)__O); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) -{ +_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, (__v8sf)_mm256_broadcast_f32x2(__A), (__v8sf)_mm256_setzero_ps()); @@ -991,16 +989,14 @@ _mm256_broadcast_f64x2(__m128d __A) { } static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) -{ +_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, (__v4df)_mm256_broadcast_f64x2(__A), (__v4df)__O); } static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) -{ +_mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, (__v4df)_mm256_broadcast_f64x2(__A), (__v4df)_mm256_setzero_pd()); @@ -1013,16 +1009,14 @@ _mm_broadcast_i32x2(__m128i __A) { } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) -{ +_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_broadcast_i32x2(__A), (__v4si)__O); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) -{ +_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_broadcast_i32x2(__A), (__v4si)_mm_setzero_si128()); @@ -1035,16 +1029,14 @@ _mm256_broadcast_i32x2(__m128i __A) { } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) -{ +_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_broadcast_i32x2(__A), (__v8si)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) -{ +_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_broadcast_i32x2(__A), (__v8si)_mm256_setzero_si256()); @@ -1057,16 +1049,14 @@ _mm256_broadcast_i64x2(__m128i __A) { } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) -{ +_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_broadcast_i64x2(__A), (__v4di)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) -{ +_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_broadcast_i64x2(__A), (__v4di)_mm256_setzero_si256()); diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 1cef356eddb0a..676b5a07e2521 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -5102,68 +5102,54 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) -{ - return (__m128i)__builtin_ia32_selectd_128(__M, - (__v4si) _mm_set1_epi32(__A), - (__v4si)__O); +_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) { + return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si)_mm_set1_epi32(__A), + (__v4si)__O); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_set1_epi32( __mmask8 __M, int __A) -{ - return (__m128i)__builtin_ia32_selectd_128(__M, - (__v4si) _mm_set1_epi32(__A), - (__v4si)_mm_setzero_si128()); +_mm_maskz_set1_epi32(__mmask8 __M, int __A) { + return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si)_mm_set1_epi32(__A), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) -{ - return (__m256i)__builtin_ia32_selectd_256(__M, - (__v8si) _mm256_set1_epi32(__A), - (__v8si)__O); +_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) { + return (__m256i)__builtin_ia32_selectd_256( + __M, (__v8si)_mm256_set1_epi32(__A), (__v8si)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_set1_epi32( __mmask8 __M, int __A) -{ - return (__m256i)__builtin_ia32_selectd_256(__M, - (__v8si) _mm256_set1_epi32(__A), - (__v8si)_mm256_setzero_si256()); +_mm256_maskz_set1_epi32(__mmask8 __M, int __A) { + return (__m256i)__builtin_ia32_selectd_256( + __M, (__v8si)_mm256_set1_epi32(__A), (__v8si)_mm256_setzero_si256()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) -{ +_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, (__v2di) _mm_set1_epi64x(__A), (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_set1_epi64 (__mmask8 __M, long long __A) -{ +_mm_maskz_set1_epi64(__mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, (__v2di) _mm_set1_epi64x(__A), (__v2di) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) -{ +_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) { return (__m256i) __builtin_ia32_selectq_256(__M, (__v4di) _mm256_set1_epi64x(__A), (__v4di) __O) ; } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) -{ - return (__m256i) __builtin_ia32_selectq_256(__M, - (__v4di) _mm256_set1_epi64x(__A), - (__v4di) _mm256_setzero_si256()); +_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) { + return (__m256i)__builtin_ia32_selectq_256( + __M, (__v4di)_mm256_set1_epi64x(__A), (__v4di)_mm256_setzero_si256()); } #define _mm_fixupimm_pd(A, B, C, imm) \ @@ -5610,130 +5596,113 @@ _mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) (__mmask8) __U); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ +_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_unpackhi_pd(__A, __B), (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) -{ +_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_unpackhi_pd(__A, __B), (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) -{ +_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_unpackhi_pd(__A, __B), (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) -{ +_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_unpackhi_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ +_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_unpackhi_ps(__A, __B), (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) -{ +_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_unpackhi_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) -{ +_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_unpackhi_ps(__A, __B), (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) -{ +_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_unpackhi_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ +_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_unpacklo_pd(__A, __B), (__v2df)__W); } static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) -{ +_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_unpacklo_pd(__A, __B), (__v2df)_mm_setzero_pd()); } static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) -{ +_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_unpacklo_pd(__A, __B), (__v4df)__W); } static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) -{ +_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_unpacklo_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ +_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_unpacklo_ps(__A, __B), (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) -{ +_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_unpacklo_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) -{ +_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_unpacklo_ps(__A, __B), (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) -{ +_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_unpacklo_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); @@ -6056,128 +6025,116 @@ _mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_unpackhi_epi32(__A, __B), (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ +_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_unpackhi_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ +_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_unpackhi_epi32(__A, __B), (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ +_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_unpackhi_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpackhi_epi64(__A, __B), (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ +_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpackhi_epi64(__A, __B), (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ +_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_unpackhi_epi64(__A, __B), (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ +_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_unpackhi_epi64(__A, __B), (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_unpacklo_epi32(__A, __B), (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ +_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_unpacklo_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ +_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_unpacklo_epi32(__A, __B), (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ +_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_unpacklo_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpacklo_epi64(__A, __B), (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ +_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpacklo_epi64(__A, __B), (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ +_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_unpacklo_epi64(__A, __B), (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ +_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_unpacklo_epi64(__A, __B), (__v4di)_mm256_setzero_si256()); @@ -6595,16 +6552,14 @@ _mm256_broadcast_f32x4(__m128 __A) { } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) -{ +_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, (__v8sf)_mm256_broadcast_f32x4(__A), (__v8sf)__O); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) -{ +_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, (__v8sf)_mm256_broadcast_f32x4(__A), (__v8sf)_mm256_setzero_ps()); @@ -6617,128 +6572,112 @@ _mm256_broadcast_i32x4(__m128i __A) { } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) -{ +_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_broadcast_i32x4(__A), (__v8si)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) -{ +_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_broadcast_i32x4(__A), (__v8si)_mm256_setzero_si256()); } static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) -{ +_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256(__M, (__v4df) _mm256_broadcastsd_pd(__A), (__v4df) __O); } static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) -{ +_mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256(__M, (__v4df) _mm256_broadcastsd_pd(__A), (__v4df) _mm256_setzero_pd()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) -{ +_mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_selectps_128(__M, (__v4sf) _mm_broadcastss_ps(__A), (__v4sf) __O); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) -{ +_mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_selectps_128(__M, (__v4sf) _mm_broadcastss_ps(__A), (__v4sf) _mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) -{ +_mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256(__M, (__v8sf) _mm256_broadcastss_ps(__A), (__v8sf) __O); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) -{ +_mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256(__M, (__v8sf) _mm256_broadcastss_ps(__A), (__v8sf) _mm256_setzero_ps()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) -{ +_mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si) _mm_broadcastd_epi32(__A), (__v4si) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) -{ +_mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si) _mm_broadcastd_epi32(__A), (__v4si) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) -{ +_mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256(__M, (__v8si) _mm256_broadcastd_epi32(__A), (__v8si) __O); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) -{ +_mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256(__M, (__v8si) _mm256_broadcastd_epi32(__A), (__v8si) _mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) -{ +_mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128(__M, (__v2di) _mm_broadcastq_epi64(__A), (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) -{ +_mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128(__M, (__v2di) _mm_broadcastq_epi64(__A), (__v2di) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) -{ +_mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256(__M, (__v4di) _mm256_broadcastq_epi64(__A), (__v4di) __O); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) -{ +_mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256(__M, (__v4di) _mm256_broadcastq_epi64(__A), (__v4di) _mm256_setzero_si256()); @@ -8004,64 +7943,56 @@ _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) (__v4di)_mm256_setzero_si256())) static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ +_mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_movehdup_ps(__A), (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) -{ +_mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_movehdup_ps(__A), (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ +_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_movehdup_ps(__A), (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) -{ +_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_movehdup_ps(__A), (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ +_mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_moveldup_ps(__A), (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) -{ +_mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_moveldup_ps(__A), (__v4sf)_mm_setzero_ps()); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ +_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_moveldup_ps(__A), (__v8sf)__W); } static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) -{ +_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_moveldup_ps(__A), (__v8sf)_mm256_setzero_ps()); From d9488e509bf71273f673a5ecd5b05024a86801c2 Mon Sep 17 00:00:00 2001 From: ahmed Date: Tue, 7 Oct 2025 17:50:05 +0300 Subject: [PATCH 03/12] refactor: update tests --- clang/test/CodeGen/X86/avx512dq-builtins.c | 23 ++--- clang/test/CodeGen/X86/avx512vl-builtins.c | 114 ++++++++++----------- 2 files changed, 63 insertions(+), 74 deletions(-) diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index 0676b5b091588..c361a1426547b 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -13,21 +13,20 @@ #include "builtin_test_helpers.h" // constexpr coverage for i32x8/f32x8 broadcasts (DQ) -{ - __m256i a = _mm256_set1_epi32(8); - TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x8(_mm512_setzero_si512(), 0xFFFF, a), + + + TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x8(_mm512_setzero_si512(), 0xFFFF, _mm256_set1_epi32(8)), 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8)); -} -{ - __m256 a = _mm256_set1_ps(5.0f); - TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x8(_mm512_setzero_ps(), 0xFFFF, a), + + + + TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x8(_mm512_setzero_ps(), 0xFFFF, _mm256_set1_ps(5.0f)), 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5)); -} -{ - __m256 a = _mm256_set1_ps(7.0f); - TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x8(0xFFFF, a), + + + TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x8(0xFFFF, _mm256_set1_ps(7.0f)), 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7)); -} + __mmask8 test_knot_mask8(__mmask8 a) { // CHECK-LABEL: test_knot_mask8 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 1d1d3520d7659..1c4ed17144b01 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -14,78 +14,68 @@ TEST_CONSTEXPR(match_v2di(_mm_mask_set1_epi64(_mm_setzero_si128(), 0x3, 9), 9, 9 TEST_CONSTEXPR(match_v8si(_mm256_mask_set1_epi32(_mm256_setzero_si256(), 0xFF, 5), 5, 5, 5, 5, 5, 5, 5, 5)); TEST_CONSTEXPR(match_v4di(_mm256_mask_set1_epi64(_mm256_setzero_si256(), 0xF, 11), 11, 11, 11, 11)); -{ - __m128i a = (__m128i)(__v4si){0,1,2,3}; - TEST_CONSTEXPR(match_v4si(_mm_mask_broadcast_i32x2(_mm_setzero_si128(), 0xF, a), 0,1,0,1)); -} -{ - __m128 a = (__m128)(__v4sf){1.f,2.f,3.f,4.f}; - TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xFF, a), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); -} -{ - __m128 a = (__m128)(__v4sf){1.f,2.f,3.f,4.f}; - TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xFF, a), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); -} -{ - __m128d a = (__m128d)(__v2df){1.0,2.0}; - TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xFF, a), 1.0,2.0,1.0,2.0)); -} -{ - __m128d a = (__m128d)(__v2df){1.0,2.0}; - TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xFF, a), 1.0,2.0,1.0,2.0)); -} + + + TEST_CONSTEXPR(match_v4si(_mm_mask_broadcast_i32x2(_mm_setzero_si128(), 0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); + + + + TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); + + + TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); + + + TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); + + + + TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); + // i64x2 maskz -{ - __m128i a = (__m128i)(__v2di){1,2}; - TEST_CONSTEXPR(match_v4di(_mm256_maskz_broadcast_i64x2(0xF, a), 1,2,1,2)); -} + + TEST_CONSTEXPR(match_v4di(_mm256_maskz_broadcast_i64x2(0xF, (__m128i)(__v2di){1,2}), 1,2,1,2)); + // i32x4/f32x4 maskz (VL) -{ - __m128i a = (__m128i)(__v4si){0,1,2,3}; - TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x4(0xFF, a), 0,1,2,3,0,1,2,3)); -} -{ - __m128 a = (__m128)(__v4sf){0,1,2,3}; - TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x4(0xFF, a), 0,1,2,3,0,1,2,3)); -} + + + TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x4(0xFF, (__m128i)(__v4si){0,1,2,3}), 0,1,2,3,0,1,2,3)); + + + + TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x4(0xFF, (__m128)(__v4sf){0,1,2,3}), 0,1,2,3,0,1,2,3)); + // i32x2 maskz (128/256) -{ - __m128i a = (__m128i)(__v4si){0,1,2,3}; - TEST_CONSTEXPR(match_v4si(_mm_maskz_broadcast_i32x2(0xF, a), 0,1,0,1)); -} -{ - __m128i a = (__m128i)(__v4si){0,1,2,3}; - TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x2(0xFF, a), 0,1,0,1,0,1,0,1)); -} + + TEST_CONSTEXPR(match_v4si(_mm_maskz_broadcast_i32x2(0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); + + + + TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x2(0xFF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1)); + // unpackhi/lo with full mask behaves like the underlying unpack -{ - __m128d a = (__m128d)(__v2df){1.0,2.0}; - __m128d b = (__m128d)(__v2df){3.0,4.0}; - TEST_CONSTEXPR(match_m128d(_mm_mask_unpackhi_pd(_mm_setzero_pd(), 0x3, a, b), 2.0,4.0)); - TEST_CONSTEXPR(match_m128d(_mm_mask_unpacklo_pd(_mm_setzero_pd(), 0x3, a, b), 1.0,3.0)); -} -{ - __m256d a = (__m256d)(__v4df){1.0,2.0,3.0,4.0}; - __m256d b = (__m256d)(__v4df){5.0,6.0,7.0,8.0}; - TEST_CONSTEXPR(match_m256d(_mm256_mask_unpackhi_pd(_mm256_setzero_pd(), 0xFF, a, b), 2.0,6.0,4.0,8.0)); - TEST_CONSTEXPR(match_m256d(_mm256_mask_unpacklo_pd(_mm256_setzero_pd(), 0xFF, a, b), 1.0,5.0,3.0,7.0)); -} + + TEST_CONSTEXPR(match_m128d(_mm_mask_unpackhi_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 2.0,4.0)); + TEST_CONSTEXPR(match_m128d(_mm_mask_unpacklo_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 1.0,3.0)); + + + TEST_CONSTEXPR(match_m256d(_mm256_mask_unpackhi_pd(_mm256_setzero_pd(), 0xFF, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 2.0,6.0,4.0,8.0)); + TEST_CONSTEXPR(match_m256d(_mm256_mask_unpacklo_pd(_mm256_setzero_pd(), 0xFF, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 1.0,5.0,3.0,7.0)); + // movehdup/moveldup with full mask equals the underlying *_move*dup -{ - __m128 a = (__m128)(__v4sf){1.f,2.f,3.f,4.f}; - TEST_CONSTEXPR(match_m128(_mm_mask_movehdup_ps(_mm_setzero_ps(), 0xF, a), 2.f,2.f,4.f,4.f)); - TEST_CONSTEXPR(match_m128(_mm_mask_moveldup_ps(_mm_setzero_ps(), 0xF, a), 1.f,1.f,3.f,3.f)); -} -{ - __m256 a = (__m256)(__v8sf){1,2,3,4,5,6,7,8}; - TEST_CONSTEXPR(match_m256(_mm256_mask_movehdup_ps(_mm256_setzero_ps(), 0xFF, a), 2,2,4,4,6,6,8,8)); - TEST_CONSTEXPR(match_m256(_mm256_mask_moveldup_ps(_mm256_setzero_ps(), 0xFF, a), 1,1,3,3,5,5,7,7)); -} + + TEST_CONSTEXPR(match_m128(_mm_mask_movehdup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 2.f,2.f,4.f,4.f)); + TEST_CONSTEXPR(match_m128(_mm_mask_moveldup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,1.f,3.f,3.f)); + + + TEST_CONSTEXPR(match_m256(_mm256_mask_movehdup_ps(_mm256_setzero_ps(), 0xFF, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 2,2,4,4,6,6,8,8)); + TEST_CONSTEXPR(match_m256(_mm256_mask_moveldup_ps(_mm256_setzero_ps(), 0xFF, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 1,1,3,3,5,5,7,7)); + __mmask8 test_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) { // CHECK-LABEL: test_mm_cmpeq_epu32_mask From c9472b61b62b543678e46a78e53a0f413d5a2989 Mon Sep 17 00:00:00 2001 From: ahmed Date: Tue, 7 Oct 2025 18:13:46 +0300 Subject: [PATCH 04/12] refactor: update test --- clang/test/CodeGen/X86/avx512f-builtins.c | 81 +++++------------------ 1 file changed, 15 insertions(+), 66 deletions(-) diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 0caa23df95982..207133160e2db 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -17,75 +17,24 @@ TEST_CONSTEXPR(match_v16si(_mm512_mask_set1_epi32(_mm512_setzero_si512(), 0xFFFF TEST_CONSTEXPR(match_v8di(_mm512_mask_set1_epi64(_mm512_setzero_si512(), 0xFF, 21), 21,21,21,21,21,21,21,21)); -{ - __m128 a = (__m128)(__v4sf){1,2,3,4}; - TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x4(_mm512_setzero_ps(), 0xFFFF, a), - 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); -} -{ - __m128 a = (__m128)(__v4sf){1,2,3,4}; - TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x4(0xFFFF, a), - 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); -} -{ - __m128d a = (__m128d)(__v2df){1,2}; - TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xFF, a), - 1,2,1,2,1,2,1,2)); -} -{ - __m128d a = (__m128d)(__v2df){1,2}; - TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xFF, a), - 1,2,1,2,1,2,1,2)); -} +TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x4(_mm512_setzero_ps(), 0xFFFF, (__m128)(__v4sf){1,2,3,4}), + 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); -// additionally add i32x4/i32x8 z-forms -{ - __m128i a = (__m128i)(__v4si){0,1,2,3}; - TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x4(0xFFFF, a), - 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3)); -} -{ - __m256i a = _mm256_set1_epi32(9); - TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xFFFF, a), - 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9)); -} +TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x4(0xFFFF, (__m128)(__v4sf){1,2,3,4}), + 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); -// unpack and moves (512) -{ - __m512 a = _mm512_set_ps(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); - __m512 b = _mm512_set1_ps(0); - TEST_CONSTEXPR(match_m512(_mm512_mask_unpackhi_ps(_mm512_setzero_ps(), 0xFFFF, a, b), - 15,0,13,0,11,0,9,0,7,0,5,0,3,0,1,0)); -} -{ - __m512 a = _mm512_set1_ps(1); - __m512 b = _mm512_set1_ps(2); - TEST_CONSTEXPR(match_m512(_mm512_mask_unpacklo_ps(_mm512_setzero_ps(), 0xFFFF, a, b), - 1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2)); -} +TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xFF, (__m128d)(__v2df){1,2}), + 1,2,1,2,1,2,1,2)); -// move_ss/move_sd in constexpr masked form -{ - __m128 a = (__m128)(__v4sf){10,2,3,4}; - __m128 b = (__m128)(__v4sf){20,6,7,8}; - TEST_CONSTEXPR(match_m128(_mm_mask_move_ss(_mm_setzero_ps(), 0x1, a, b), 20,0,0,0)); -} -{ - __m128d a = (__m128d)(__v2df){10,2}; - __m128d b = (__m128d)(__v2df){20,6}; - TEST_CONSTEXPR(match_m128d(_mm_mask_move_sd(_mm_setzero_pd(), 0x1, a, b), 20,0)); -} -// z-forms for movehdup/moveldup -{ - __m512 a = _mm512_set_ps(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); - TEST_CONSTEXPR(match_m512(_mm512_maskz_movehdup_ps(0xFFFF, a), - 15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1)); -} -{ - __m512 a = _mm512_set_ps(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); - TEST_CONSTEXPR(match_m512(_mm512_maskz_moveldup_ps(0xFFFF, a), - 16,16,14,14,12,12,10,10,8,8,6,6,4,4,2,2)); -} +TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1,2}), + 1,2,1,2,1,2,1,2)); + +// additionally add i32x4/i32x8 z-forms +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x4(0xFFFF, (__m128i)(__v4si){0,1,2,3}), + 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3)); + +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xFFFF, _mm256_set1_epi32(9)), + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9)); __m512d test_mm512_sqrt_pd(__m512d a) { From 1fcea1b13422d7dc3b12faffc8771a446aefd09c Mon Sep 17 00:00:00 2001 From: ahmed Date: Tue, 7 Oct 2025 21:52:39 +0300 Subject: [PATCH 05/12] refactor: PR Feedback --- clang/test/CodeGen/X86/avx512dq-builtins.c | 19 ++------ clang/test/CodeGen/X86/avx512f-builtins.c | 28 +++++------ clang/test/CodeGen/X86/avx512vl-builtins.c | 54 ++++++++-------------- 3 files changed, 35 insertions(+), 66 deletions(-) diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index c361a1426547b..d1e980601c462 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -12,22 +12,6 @@ #include #include "builtin_test_helpers.h" -// constexpr coverage for i32x8/f32x8 broadcasts (DQ) - - - TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x8(_mm512_setzero_si512(), 0xFFFF, _mm256_set1_epi32(8)), - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8)); - - - - TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x8(_mm512_setzero_ps(), 0xFFFF, _mm256_set1_ps(5.0f)), - 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5)); - - - TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x8(0xFFFF, _mm256_set1_ps(7.0f)), - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7)); - - __mmask8 test_knot_mask8(__mmask8 a) { // CHECK-LABEL: test_knot_mask8 // CHECK: [[IN:%.*]] = bitcast i8 %{{.*}} to <8 x i1> @@ -1321,6 +1305,7 @@ __m512 test_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, float const* _ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_broadcast_f32x8(__O, __M, _mm256_loadu_ps(__A)); } +TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x8(_mm512_setzero_ps(), 0xFFFF, _mm256_set1_ps(5.0f)), 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5)); __m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, float const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_f32x8 @@ -1328,6 +1313,7 @@ __m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, float const* __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_broadcast_f32x8(__M, _mm256_loadu_ps(__A)); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x8(0xFFFF, _mm256_set1_ps(7.0f)), 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7)); __m512d test_mm512_broadcast_f64x2(double const* __A) { // CHECK-LABEL: test_mm512_broadcast_f64x2 @@ -1384,6 +1370,7 @@ __m512i test_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i cons // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_broadcast_i32x8(__O, __M, _mm256_loadu_si256(__A)); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x8(_mm512_setzero_si512(), 0xFFFF, _mm256_set1_epi32(8)), 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8)); __m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_i32x8 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 207133160e2db..d06808631ab4e 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -11,28 +11,14 @@ #include #include "builtin_test_helpers.h" -// constexpr coverage for masked set1/broadcast/unpack/move_ss/move_sd (F) -TEST_CONSTEXPR(match_v16si(_mm512_mask_set1_epi32(_mm512_setzero_si512(), 0xFFFF, 13), - 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13)); -TEST_CONSTEXPR(match_v8di(_mm512_mask_set1_epi64(_mm512_setzero_si512(), 0xFF, 21), - 21,21,21,21,21,21,21,21)); - -TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x4(_mm512_setzero_ps(), 0xFFFF, (__m128)(__v4sf){1,2,3,4}), - 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); - -TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x4(0xFFFF, (__m128)(__v4sf){1,2,3,4}), - 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); - +// Constexpr coverage for DQ broadcast features (f64x2/i32x8) that don't have test functions in this file. +// The corresponding test_mm512_*_broadcast_* functions are in avx512dq-builtins.c. TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xFF, (__m128d)(__v2df){1,2}), 1,2,1,2,1,2,1,2)); TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1,2}), 1,2,1,2,1,2,1,2)); -// additionally add i32x4/i32x8 z-forms -TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x4(0xFFFF, (__m128i)(__v4si){0,1,2,3}), - 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3)); - TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xFFFF, _mm256_set1_epi32(9)), 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9)); @@ -6861,6 +6847,8 @@ __m512 test_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, float const* _ return _mm512_mask_broadcast_f32x4(__O, __M, _mm_loadu_ps(__A)); } +TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x4(_mm512_setzero_ps(), 0xFFFF, (__m128)(__v4sf){1,2,3,4}), 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); + __m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_f32x4 // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> @@ -6868,6 +6856,8 @@ __m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) { return _mm512_maskz_broadcast_f32x4(__M, _mm_loadu_ps(__A)); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x4(0xFFFF, (__m128)(__v4sf){1,2,3,4}), 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); + __m512d test_mm512_broadcast_f64x4(double const* __A) { // CHECK-LABEL: test_mm512_broadcast_f64x4 // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> @@ -6910,6 +6900,8 @@ __m512i test_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i const* __A) { return _mm512_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A)); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x4(0xFFFF, (__m128i)(__v4si){0,1,2,3}), 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3)); + __m512i test_mm512_broadcast_i64x4(__m256i const* __A) { // CHECK-LABEL: test_mm512_broadcast_i64x4 // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> @@ -10928,6 +10920,8 @@ __m512i test_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) return _mm512_mask_set1_epi32 ( __O, __M, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_set1_epi32(_mm512_setzero_si512(), 0xFFFF, 13), 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13)); + __m512i test_mm512_maskz_set1_epi32(__mmask16 __M, int __A) { // CHECK-LABEL: test_mm512_maskz_set1_epi32 @@ -11170,6 +11164,8 @@ __m512i test_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) return _mm512_mask_set1_epi64 (__O, __M, __A); } +TEST_CONSTEXPR(match_v8di(_mm512_mask_set1_epi64(_mm512_setzero_si512(), 0xFF, 21), 21,21,21,21,21,21,21,21)); + __m512i test_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) { // CHECK-LABEL: test_mm512_maskz_set1_epi64 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 1c4ed17144b01..f3b8a4c246623 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -7,45 +7,19 @@ #include #include "builtin_test_helpers.h" -// constexpr coverage for masked set1/broadcast/unpack/movehdup/moveldup (VL) -TEST_CONSTEXPR(match_v4si(_mm_mask_set1_epi32(_mm_setzero_si128(), 0xF, 7), 7, 7, 7, 7)); -TEST_CONSTEXPR(match_v2di(_mm_mask_set1_epi64(_mm_setzero_si128(), 0x3, 9), 9, 9)); - -TEST_CONSTEXPR(match_v8si(_mm256_mask_set1_epi32(_mm256_setzero_si256(), 0xFF, 5), 5, 5, 5, 5, 5, 5, 5, 5)); -TEST_CONSTEXPR(match_v4di(_mm256_mask_set1_epi64(_mm256_setzero_si256(), 0xF, 11), 11, 11, 11, 11)); - - - - TEST_CONSTEXPR(match_v4si(_mm_mask_broadcast_i32x2(_mm_setzero_si128(), 0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); - +// Constexpr coverage for VLDQ broadcast features (i32x2/f32x2/f64x2/i64x2) that don't have test functions in this file. +// The corresponding test_mm*_*_broadcast_* functions are in avx512vldq-builtins.c. +TEST_CONSTEXPR(match_v4si(_mm_mask_broadcast_i32x2(_mm_setzero_si128(), 0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); - - TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); - - - TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); - - - TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); - - - - TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); +TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); +TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); -// i64x2 maskz +TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); - TEST_CONSTEXPR(match_v4di(_mm256_maskz_broadcast_i64x2(0xF, (__m128i)(__v2di){1,2}), 1,2,1,2)); +TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); - -// i32x4/f32x4 maskz (VL) - - - TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x4(0xFF, (__m128i)(__v4si){0,1,2,3}), 0,1,2,3,0,1,2,3)); - - - - TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x4(0xFF, (__m128)(__v4sf){0,1,2,3}), 0,1,2,3,0,1,2,3)); +TEST_CONSTEXPR(match_v4di(_mm256_maskz_broadcast_i64x2(0xF, (__m128i)(__v2di){1,2}), 1,2,1,2)); // i32x2 maskz (128/256) @@ -7271,6 +7245,8 @@ __m128i test_mm_mask_set1_epi32(__m128i __O, __mmask8 __M) { return _mm_mask_set1_epi32(__O, __M, 5); } +TEST_CONSTEXPR(match_v4si(_mm_mask_set1_epi32(_mm_setzero_si128(), 0xF, 7), 7, 7, 7, 7)); + __m128i test_mm_maskz_set1_epi32(__mmask8 __M) { // CHECK-LABEL: test_mm_maskz_set1_epi32 // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0 @@ -7296,6 +7272,8 @@ __m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) { return _mm256_mask_set1_epi32(__O, __M, 5); } +TEST_CONSTEXPR(match_v8si(_mm256_mask_set1_epi32(_mm256_setzero_si256(), 0xFF, 5), 5, 5, 5, 5, 5, 5, 5, 5)); + __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) { // CHECK-LABEL: test_mm256_maskz_set1_epi32 // CHECK: insertelement <8 x i32> poison, i32 %{{.*}}, i32 0 @@ -7319,6 +7297,8 @@ __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { return _mm_mask_set1_epi64(__O, __M, __A); } +TEST_CONSTEXPR(match_v2di(_mm_mask_set1_epi64(_mm_setzero_si128(), 0x3, 9), 9, 9)); + __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) { // CHECK-LABEL: test_mm_maskz_set1_epi64 // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0 @@ -7339,6 +7319,8 @@ __m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) { return _mm256_mask_set1_epi64(__O, __M, __A); } +TEST_CONSTEXPR(match_v4di(_mm256_mask_set1_epi64(_mm256_setzero_si256(), 0xF, 11), 11, 11, 11, 11)); + __m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) { // CHECK-LABEL: test_mm256_maskz_set1_epi64 // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0 @@ -9067,6 +9049,8 @@ __m256 test_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) { return _mm256_maskz_broadcast_f32x4(__M, __A); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x4(0xFF, (__m128)(__v4sf){0,1,2,3}), 0,1,2,3,0,1,2,3)); + __m256i test_mm256_broadcast_i32x4(__m128i const* __A) { // CHECK-LABEL: test_mm256_broadcast_i32x4 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> @@ -9088,6 +9072,8 @@ __m256i test_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i const* __A) { return _mm256_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A)); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x4(0xFF, (__m128i)(__v4si){0,1,2,3}), 0,1,2,3,0,1,2,3)); + __m256d test_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) { // CHECK-LABEL: test_mm256_mask_broadcastsd_pd // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer From 39619bfe0ac953b3b86772fa7666baf4df6fefe0 Mon Sep 17 00:00:00 2001 From: ahmed Date: Tue, 7 Oct 2025 21:56:16 +0300 Subject: [PATCH 06/12] chore: fix formatting --- clang/lib/Headers/avx512fintrin.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 8401647ef9da5..07de036ef5143 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -247,9 +247,8 @@ _mm512_broadcastq_epi64(__m128i __A) { static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) { - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di) _mm512_broadcastq_epi64(__A), - (__v8di) __O); + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_broadcastq_epi64(__A), (__v8di)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR From 539b86e612eda41809f11c3b00f90de66b38949c Mon Sep 17 00:00:00 2001 From: ahmed Date: Thu, 9 Oct 2025 13:09:04 +0300 Subject: [PATCH 07/12] chore: Move tests from avx512vl-builtins --- clang/test/CodeGen/X86/avx512vl-builtins.c | 60 ++++++-------------- clang/test/CodeGen/X86/avx512vldq-builtins.c | 16 ++++++ 2 files changed, 32 insertions(+), 44 deletions(-) diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index f3b8a4c246623..71bd5f4949611 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -7,50 +7,6 @@ #include #include "builtin_test_helpers.h" -// Constexpr coverage for VLDQ broadcast features (i32x2/f32x2/f64x2/i64x2) that don't have test functions in this file. -// The corresponding test_mm*_*_broadcast_* functions are in avx512vldq-builtins.c. -TEST_CONSTEXPR(match_v4si(_mm_mask_broadcast_i32x2(_mm_setzero_si128(), 0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); - -TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); - -TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); - -TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); - -TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); - -TEST_CONSTEXPR(match_v4di(_mm256_maskz_broadcast_i64x2(0xF, (__m128i)(__v2di){1,2}), 1,2,1,2)); - - -// i32x2 maskz (128/256) - - TEST_CONSTEXPR(match_v4si(_mm_maskz_broadcast_i32x2(0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); - - - - TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x2(0xFF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1)); - - -// unpackhi/lo with full mask behaves like the underlying unpack - - TEST_CONSTEXPR(match_m128d(_mm_mask_unpackhi_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 2.0,4.0)); - TEST_CONSTEXPR(match_m128d(_mm_mask_unpacklo_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 1.0,3.0)); - - - TEST_CONSTEXPR(match_m256d(_mm256_mask_unpackhi_pd(_mm256_setzero_pd(), 0xFF, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 2.0,6.0,4.0,8.0)); - TEST_CONSTEXPR(match_m256d(_mm256_mask_unpacklo_pd(_mm256_setzero_pd(), 0xFF, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 1.0,5.0,3.0,7.0)); - - -// movehdup/moveldup with full mask equals the underlying *_move*dup - - TEST_CONSTEXPR(match_m128(_mm_mask_movehdup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 2.f,2.f,4.f,4.f)); - TEST_CONSTEXPR(match_m128(_mm_mask_moveldup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,1.f,3.f,3.f)); - - - TEST_CONSTEXPR(match_m256(_mm256_mask_movehdup_ps(_mm256_setzero_ps(), 0xFF, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 2,2,4,4,6,6,8,8)); - TEST_CONSTEXPR(match_m256(_mm256_mask_moveldup_ps(_mm256_setzero_ps(), 0xFF, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 1,1,3,3,5,5,7,7)); - - __mmask8 test_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) { // CHECK-LABEL: test_mm_cmpeq_epu32_mask // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}} @@ -7675,6 +7631,8 @@ __m128d test_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d return _mm_mask_unpackhi_pd(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_unpackhi_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 2.0,4.0)); + __m128d test_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_unpackhi_pd // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> @@ -7689,6 +7647,8 @@ __m256d test_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m2 return _mm256_mask_unpackhi_pd(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_unpackhi_pd(_mm256_setzero_pd(), 0xFF, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 2.0,6.0,4.0,8.0)); + __m256d test_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_unpackhi_pd // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> @@ -7731,6 +7691,8 @@ __m128d test_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d return _mm_mask_unpacklo_pd(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_unpacklo_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 1.0,3.0)); + __m128d test_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_unpacklo_pd // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> @@ -7745,6 +7707,8 @@ __m256d test_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m2 return _mm256_mask_unpacklo_pd(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_unpacklo_pd(_mm256_setzero_pd(), 0xFF, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 1.0,5.0,3.0,7.0)); + __m256d test_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_unpacklo_pd // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> @@ -10380,6 +10344,8 @@ __m128 test_mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return _mm_mask_movehdup_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m128(_mm_mask_movehdup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 2.f,2.f,4.f,4.f)); + __m128 test_mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) { // CHECK-LABEL: test_mm_maskz_movehdup_ps // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> @@ -10394,6 +10360,8 @@ __m256 test_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return _mm256_mask_movehdup_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m256(_mm256_mask_movehdup_ps(_mm256_setzero_ps(), 0xFF, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 2,2,4,4,6,6,8,8)); + __m256 test_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_movehdup_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> @@ -10408,6 +10376,8 @@ __m128 test_mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return _mm_mask_moveldup_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m128(_mm_mask_moveldup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,1.f,3.f,3.f)); + __m128 test_mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) { // CHECK-LABEL: test_mm_maskz_moveldup_ps // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> @@ -10422,6 +10392,8 @@ __m256 test_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return _mm256_mask_moveldup_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m256(_mm256_mask_moveldup_ps(_mm256_setzero_ps(), 0xFF, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 1,1,3,3,5,5,7,7)); + __m256 test_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_moveldup_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c index 938845799acf5..a8708c5fe4349 100644 --- a/clang/test/CodeGen/X86/avx512vldq-builtins.c +++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c @@ -987,6 +987,8 @@ __m256 test_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) { return _mm256_mask_broadcast_f32x2(__O, __M, __A); } +TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); + __m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { // CHECK-LABEL: test_mm256_maskz_broadcast_f32x2 // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> @@ -994,6 +996,8 @@ __m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { return _mm256_maskz_broadcast_f32x2(__M, __A); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); + __m256d test_mm256_broadcast_f64x2(double const* __A) { // CHECK-LABEL: test_mm256_broadcast_f64x2 // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> @@ -1008,6 +1012,8 @@ __m256d test_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, double const* return _mm256_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A)); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); + __m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { // CHECK-LABEL: test_mm256_maskz_broadcast_f64x2 // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> @@ -1015,6 +1021,8 @@ __m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { return _mm256_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A)); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); + __m128i test_mm_broadcast_i32x2(__m128i __A) { // CHECK-LABEL: test_mm_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> @@ -1029,6 +1037,8 @@ __m128i test_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) { return _mm_mask_broadcast_i32x2(__O, __M, __A); } +TEST_CONSTEXPR(match_v4si(_mm_mask_broadcast_i32x2(_mm_setzero_si128(), 0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); + __m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { // CHECK-LABEL: test_mm_maskz_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> @@ -1036,6 +1046,8 @@ __m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return _mm_maskz_broadcast_i32x2(__M, __A); } +TEST_CONSTEXPR(match_v4si(_mm_maskz_broadcast_i32x2(0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); + __m256i test_mm256_broadcast_i32x2(__m128i __A) { // CHECK-LABEL: test_mm256_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> @@ -1057,6 +1069,8 @@ __m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return _mm256_maskz_broadcast_i32x2(__M, __A); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x2(0xFF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1)); + __m256i test_mm256_broadcast_i64x2(__m128i const* __A) { // CHECK-LABEL: test_mm256_broadcast_i64x2 // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> @@ -1078,6 +1092,8 @@ __m256i test_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i const* __A) { return _mm256_maskz_broadcast_i64x2(__M, _mm_loadu_si128(__A)); } +TEST_CONSTEXPR(match_v4di(_mm256_maskz_broadcast_i64x2(0xF, (__m128i)(__v2di){1,2}), 1,2,1,2)); + __m128d test_mm256_extractf64x2_pd(__m256d __A) { // CHECK-LABEL: test_mm256_extractf64x2_pd // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> From ccc654a658febe990610d96b0f65e02c8a9511c9 Mon Sep 17 00:00:00 2001 From: ahmed Date: Thu, 9 Oct 2025 13:13:13 +0300 Subject: [PATCH 08/12] chore: Move test func from avx512f-builtins to corresponding test file --- clang/test/CodeGen/X86/avx512dq-builtins.c | 6 ++++++ clang/test/CodeGen/X86/avx512f-builtins.c | 11 ----------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index d1e980601c462..d25921f852b95 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1329,6 +1329,8 @@ __m512d test_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, double const* return _mm512_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A)); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xFF, (__m128d)(__v2df){1,2}), 1,2,1,2,1,2,1,2)); + __m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_f64x2 // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> @@ -1336,6 +1338,8 @@ __m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { return _mm512_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A)); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1,2}), 1,2,1,2,1,2,1,2)); + __m512i test_mm512_broadcast_i32x2(__m128i __A) { // CHECK-LABEL: test_mm512_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> @@ -1379,6 +1383,8 @@ __m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i const* __A) { return _mm512_maskz_broadcast_i32x8(__M, _mm256_loadu_si256(__A)); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xFFFF, _mm256_set1_epi32(9)), 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9)); + __m512i test_mm512_broadcast_i64x2(__m128i const* __A) { // CHECK-LABEL: test_mm512_broadcast_i64x2 // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index d06808631ab4e..011b677f0f182 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -11,17 +11,6 @@ #include #include "builtin_test_helpers.h" -// Constexpr coverage for DQ broadcast features (f64x2/i32x8) that don't have test functions in this file. -// The corresponding test_mm512_*_broadcast_* functions are in avx512dq-builtins.c. -TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xFF, (__m128d)(__v2df){1,2}), - 1,2,1,2,1,2,1,2)); - -TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1,2}), - 1,2,1,2,1,2,1,2)); - -TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xFFFF, _mm256_set1_epi32(9)), - 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9)); - __m512d test_mm512_sqrt_pd(__m512d a) { // CHECK-LABEL: test_mm512_sqrt_pd From f0f7f3f38edfa118ad45871173e6c0b42c3309bc Mon Sep 17 00:00:00 2001 From: ahmed Date: Wed, 15 Oct 2025 12:08:06 +0300 Subject: [PATCH 09/12] chore: PR Feedback --- clang/test/CodeGen/X86/avx512dq-builtins.c | 2 +- clang/test/CodeGen/X86/avx512vl-builtins.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index d25921f852b95..9d7966d6dc09a 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1305,7 +1305,7 @@ __m512 test_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, float const* _ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_broadcast_f32x8(__O, __M, _mm256_loadu_ps(__A)); } -TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x8(_mm512_setzero_ps(), 0xFFFF, _mm256_set1_ps(5.0f)), 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5)); +TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x8(_mm512_setzero_ps(), 0xAAAA, (__m256)(__v8sf){5.0f,5.0f,5.0f,5.0f,5.0f,5.0f,5.0f,5.0f}), 0,5,0,5,0,5,0,5,0,5,0,5,0,5,0,5)); __m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, float const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_f32x8 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 71bd5f4949611..5c7a00d9375ee 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -7700,6 +7700,8 @@ __m128d test_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) { return _mm_maskz_unpacklo_pd(__U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_unpacklo_pd(0x2, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 0.0,3.0)); + __m256d test_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_unpacklo_pd // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> @@ -7716,6 +7718,8 @@ __m256d test_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) { return _mm256_maskz_unpacklo_pd(__U, __A, __B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_unpacklo_pd(0x0A, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 0.0,5.0,0.0,7.0)); + __m128 test_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mask_unpacklo_ps // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> From df5211ee5ff9689e44ba96868cc6ed1554a74a60 Mon Sep 17 00:00:00 2001 From: ahmed Date: Thu, 16 Oct 2025 11:40:52 +0300 Subject: [PATCH 10/12] chore: add missing tests --- clang/test/CodeGen/X86/avx512dq-builtins.c | 4 ++++ clang/test/CodeGen/X86/avx512f-builtins.c | 4 ++++ clang/test/CodeGen/X86/avx512vl-builtins.c | 12 ++++++++++++ clang/test/CodeGen/X86/avx512vldq-builtins.c | 2 ++ 4 files changed, 22 insertions(+) diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index 9d7966d6dc09a..b0e1588dd3dc5 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1354,6 +1354,8 @@ __m512i test_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) return _mm512_mask_broadcast_i32x2(__O, __M, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x2(_mm512_setzero_si512(), 0xAAAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1)); + __m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> @@ -1361,6 +1363,8 @@ __m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) { return _mm512_maskz_broadcast_i32x2(__M, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x2(0xAAAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1)); + __m512i test_mm512_broadcast_i32x8(__m256i const* __A) { // CHECK-LABEL: test_mm512_broadcast_i32x8 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x i32> diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 011b677f0f182..a4319aef20a9b 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -10934,6 +10934,8 @@ __m512i test_mm512_maskz_set1_epi32(__mmask16 __M, int __A) return _mm512_maskz_set1_epi32(__M, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_set1_epi32(0xAAAA, 19), 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0)); + __m512i test_mm512_set_epi8(char e63, char e62, char e61, char e60, char e59, char e58, char e57, char e56, char e55, char e54, char e53, char e52, @@ -11170,6 +11172,8 @@ __m512i test_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) return _mm512_maskz_set1_epi64 (__M, __A); } +TEST_CONSTEXPR(match_v8di(_mm512_maskz_set1_epi64(0xAA, 23), 0, 23, 0, 23, 0, 23, 0, 23)); + __m512i test_mm512_set_epi64 (long long __A, long long __B, long long __C, long long __D, long long __E, long long __F, diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 5c7a00d9375ee..13d23fbe88c83 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -7214,6 +7214,8 @@ __m128i test_mm_maskz_set1_epi32(__mmask8 __M) { return _mm_maskz_set1_epi32(__M, 5); } +TEST_CONSTEXPR(match_v4si(_mm_maskz_set1_epi32(0xA, 11), 11, 0, 11, 0)); + __m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) { // CHECK-LABEL: test_mm256_mask_set1_epi32 // CHECK: insertelement <8 x i32> poison, i32 %{{.*}}, i32 0 @@ -7244,6 +7246,8 @@ __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) { return _mm256_maskz_set1_epi32(__M, 5); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_set1_epi32(0xAA, 13), 13, 0, 13, 0, 13, 0, 13, 0)); + __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { // CHECK-LABEL: test_mm_mask_set1_epi64 // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0 @@ -7264,6 +7268,8 @@ __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) { return _mm_maskz_set1_epi64(__M, __A); } +TEST_CONSTEXPR(match_v2di(_mm_maskz_set1_epi64(0x2, 15), 0, 15)); + __m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) { // CHECK-LABEL: test_mm256_mask_set1_epi64 // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0 @@ -7288,6 +7294,8 @@ __m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) { return _mm256_maskz_set1_epi64(__M, __A); } +TEST_CONSTEXPR(match_v4di(_mm256_maskz_set1_epi64(0xA, 17), 0, 17, 0, 17)); + __m128d test_mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C) { // CHECK-LABEL: test_mm_fixupimm_pd // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.128 @@ -8059,6 +8067,8 @@ __m128i test_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m1 return _mm_mask_unpackhi_epi32(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v4si(_mm_mask_unpackhi_epi32(_mm_setzero_si128(), 0xA, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 0,2,0,3)); + __m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_unpackhi_epi32 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> @@ -8066,6 +8076,8 @@ __m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return _mm_maskz_unpackhi_epi32(__U, __A, __B); } +TEST_CONSTEXPR(match_v4si(_mm_maskz_unpackhi_epi32(0x5, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 0,2,0,7)); + __m256i test_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_unpackhi_epi32 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c index a8708c5fe4349..6cefa73a9aba0 100644 --- a/clang/test/CodeGen/X86/avx512vldq-builtins.c +++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c @@ -1062,6 +1062,8 @@ __m256i test_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) return _mm256_mask_broadcast_i32x2(__O, __M, __A); } +TEST_CONSTEXPR(match_v8si(_mm256_mask_broadcast_i32x2(_mm256_setzero_si256(), 0xAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1)); + __m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { // CHECK-LABEL: test_mm256_maskz_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> From 89ad0c344bcf70a5b6891f9e11764ff97c7c0d43 Mon Sep 17 00:00:00 2001 From: ahmed Date: Thu, 16 Oct 2025 12:08:14 +0300 Subject: [PATCH 11/12] refactor: update tests --- clang/test/CodeGen/X86/avx512f-builtins.c | 2 +- clang/test/CodeGen/X86/avx512vl-builtins.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index a4319aef20a9b..a70ff048f1a32 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -10934,7 +10934,7 @@ __m512i test_mm512_maskz_set1_epi32(__mmask16 __M, int __A) return _mm512_maskz_set1_epi32(__M, __A); } -TEST_CONSTEXPR(match_v16si(_mm512_maskz_set1_epi32(0xAAAA, 19), 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0, 19, 0)); +TEST_CONSTEXPR(match_v16si(_mm512_maskz_set1_epi32(0xAAAA, 19), 0,19,0,19,0,19,0,19,0,19,0,19,0,19,0,19)); __m512i test_mm512_set_epi8(char e63, char e62, char e61, char e60, char e59, diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 13d23fbe88c83..36b8d399d3647 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -7214,7 +7214,7 @@ __m128i test_mm_maskz_set1_epi32(__mmask8 __M) { return _mm_maskz_set1_epi32(__M, 5); } -TEST_CONSTEXPR(match_v4si(_mm_maskz_set1_epi32(0xA, 11), 11, 0, 11, 0)); +TEST_CONSTEXPR(match_v4si(_mm_maskz_set1_epi32(0xA, 11), 0, 11, 0, 11)); __m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) { // CHECK-LABEL: test_mm256_mask_set1_epi32 @@ -7246,7 +7246,7 @@ __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) { return _mm256_maskz_set1_epi32(__M, 5); } -TEST_CONSTEXPR(match_v8si(_mm256_maskz_set1_epi32(0xAA, 13), 13, 0, 13, 0, 13, 0, 13, 0)); +TEST_CONSTEXPR(match_v8si(_mm256_maskz_set1_epi32(0xAA, 13), 0, 13, 0, 13, 0, 13, 0, 13)); __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { // CHECK-LABEL: test_mm_mask_set1_epi64 @@ -8067,7 +8067,7 @@ __m128i test_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m1 return _mm_mask_unpackhi_epi32(__W, __U, __A, __B); } -TEST_CONSTEXPR(match_v4si(_mm_mask_unpackhi_epi32(_mm_setzero_si128(), 0xA, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 0,2,0,3)); +TEST_CONSTEXPR(match_v4si(_mm_mask_unpackhi_epi32(_mm_setzero_si128(), 0xA, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 0,6,0,7)); __m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_unpackhi_epi32 @@ -8076,7 +8076,7 @@ __m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return _mm_maskz_unpackhi_epi32(__U, __A, __B); } -TEST_CONSTEXPR(match_v4si(_mm_maskz_unpackhi_epi32(0x5, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 0,2,0,7)); +TEST_CONSTEXPR(match_v4si(_mm_maskz_unpackhi_epi32(0x5, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 2,0,3,0)); __m256i test_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_unpackhi_epi32 From cfea576780511b404ca30e34636644bf79cb0948 Mon Sep 17 00:00:00 2001 From: ahmed Date: Thu, 16 Oct 2025 12:14:35 +0300 Subject: [PATCH 12/12] feat: Use partial masks --- clang/test/CodeGen/X86/avx512dq-builtins.c | 10 +++++----- clang/test/CodeGen/X86/avx512f-builtins.c | 10 +++++----- clang/test/CodeGen/X86/avx512vl-builtins.c | 14 +++++++------- clang/test/CodeGen/X86/avx512vldq-builtins.c | 10 +++++----- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index b0e1588dd3dc5..1b099594c88fa 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1313,7 +1313,7 @@ __m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, float const* __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_broadcast_f32x8(__M, _mm256_loadu_ps(__A)); } -TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x8(0xFFFF, _mm256_set1_ps(7.0f)), 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7)); +TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x8(0xAAAA, _mm256_set1_ps(7.0f)), 0,7,0,7,0,7,0,7,0,7,0,7,0,7,0,7)); __m512d test_mm512_broadcast_f64x2(double const* __A) { // CHECK-LABEL: test_mm512_broadcast_f64x2 @@ -1329,7 +1329,7 @@ __m512d test_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, double const* return _mm512_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A)); } -TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xFF, (__m128d)(__v2df){1,2}), 1,2,1,2,1,2,1,2)); +TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xAA, (__m128d)(__v2df){1,2}), 0,2,0,2,0,2,0,2)); __m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_f64x2 @@ -1338,7 +1338,7 @@ __m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { return _mm512_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A)); } -TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1,2}), 1,2,1,2,1,2,1,2)); +TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xAA, (__m128d)(__v2df){1,2}), 0,2,0,2,0,2,0,2)); __m512i test_mm512_broadcast_i32x2(__m128i __A) { // CHECK-LABEL: test_mm512_broadcast_i32x2 @@ -1378,7 +1378,7 @@ __m512i test_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i cons // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_broadcast_i32x8(__O, __M, _mm256_loadu_si256(__A)); } -TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x8(_mm512_setzero_si512(), 0xFFFF, _mm256_set1_epi32(8)), 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8)); +TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x8(_mm512_setzero_si512(), 0xAAAA, _mm256_set1_epi32(8)), 0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8)); __m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_i32x8 @@ -1387,7 +1387,7 @@ __m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i const* __A) { return _mm512_maskz_broadcast_i32x8(__M, _mm256_loadu_si256(__A)); } -TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xFFFF, _mm256_set1_epi32(9)), 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9)); +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xAAAA, _mm256_set1_epi32(9)), 0,9,0,9,0,9,0,9,0,9,0,9,0,9,0,9)); __m512i test_mm512_broadcast_i64x2(__m128i const* __A) { // CHECK-LABEL: test_mm512_broadcast_i64x2 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index a70ff048f1a32..3deaf8efc9632 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -6836,7 +6836,7 @@ __m512 test_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, float const* _ return _mm512_mask_broadcast_f32x4(__O, __M, _mm_loadu_ps(__A)); } -TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x4(_mm512_setzero_ps(), 0xFFFF, (__m128)(__v4sf){1,2,3,4}), 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); +TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x4(_mm512_setzero_ps(), 0xAAAA, (__m128)(__v4sf){1,2,3,4}), 0,2,0,4,0,2,0,4,0,2,0,4,0,2,0,4)); __m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_f32x4 @@ -6845,7 +6845,7 @@ __m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) { return _mm512_maskz_broadcast_f32x4(__M, _mm_loadu_ps(__A)); } -TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x4(0xFFFF, (__m128)(__v4sf){1,2,3,4}), 1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4)); +TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x4(0xAAAA, (__m128)(__v4sf){1,2,3,4}), 0,2,0,4,0,2,0,4,0,2,0,4,0,2,0,4)); __m512d test_mm512_broadcast_f64x4(double const* __A) { // CHECK-LABEL: test_mm512_broadcast_f64x4 @@ -6889,7 +6889,7 @@ __m512i test_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i const* __A) { return _mm512_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A)); } -TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x4(0xFFFF, (__m128i)(__v4si){0,1,2,3}), 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3)); +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x4(0xAAAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,3,0,1,0,3,0,1,0,3,0,1,0,3)); __m512i test_mm512_broadcast_i64x4(__m256i const* __A) { // CHECK-LABEL: test_mm512_broadcast_i64x4 @@ -10909,7 +10909,7 @@ __m512i test_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) return _mm512_mask_set1_epi32 ( __O, __M, __A); } -TEST_CONSTEXPR(match_v16si(_mm512_mask_set1_epi32(_mm512_setzero_si512(), 0xFFFF, 13), 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13)); +TEST_CONSTEXPR(match_v16si(_mm512_mask_set1_epi32(_mm512_setzero_si512(), 0xAAAA, 13), 0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13)); __m512i test_mm512_maskz_set1_epi32(__mmask16 __M, int __A) { @@ -11155,7 +11155,7 @@ __m512i test_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) return _mm512_mask_set1_epi64 (__O, __M, __A); } -TEST_CONSTEXPR(match_v8di(_mm512_mask_set1_epi64(_mm512_setzero_si512(), 0xFF, 21), 21,21,21,21,21,21,21,21)); +TEST_CONSTEXPR(match_v8di(_mm512_mask_set1_epi64(_mm512_setzero_si512(), 0xAA, 21), 0,21,0,21,0,21,0,21)); __m512i test_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) { diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 36b8d399d3647..9b6bfea918191 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -7230,7 +7230,7 @@ __m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) { return _mm256_mask_set1_epi32(__O, __M, 5); } -TEST_CONSTEXPR(match_v8si(_mm256_mask_set1_epi32(_mm256_setzero_si256(), 0xFF, 5), 5, 5, 5, 5, 5, 5, 5, 5)); +TEST_CONSTEXPR(match_v8si(_mm256_mask_set1_epi32(_mm256_setzero_si256(), 0xAA, 5), 0, 5, 0, 5, 0, 5, 0, 5)); __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) { // CHECK-LABEL: test_mm256_maskz_set1_epi32 @@ -7655,7 +7655,7 @@ __m256d test_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m2 return _mm256_mask_unpackhi_pd(__W, __U, __A, __B); } -TEST_CONSTEXPR(match_m256d(_mm256_mask_unpackhi_pd(_mm256_setzero_pd(), 0xFF, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 2.0,6.0,4.0,8.0)); +TEST_CONSTEXPR(match_m256d(_mm256_mask_unpackhi_pd(_mm256_setzero_pd(), 0xAA, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 0,6.0,0,8.0)); __m256d test_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_unpackhi_pd @@ -7717,7 +7717,7 @@ __m256d test_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m2 return _mm256_mask_unpacklo_pd(__W, __U, __A, __B); } -TEST_CONSTEXPR(match_m256d(_mm256_mask_unpacklo_pd(_mm256_setzero_pd(), 0xFF, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 1.0,5.0,3.0,7.0)); +TEST_CONSTEXPR(match_m256d(_mm256_mask_unpacklo_pd(_mm256_setzero_pd(), 0xAA, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 0,5.0,0,7.0)); __m256d test_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_unpacklo_pd @@ -9029,7 +9029,7 @@ __m256 test_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) { return _mm256_maskz_broadcast_f32x4(__M, __A); } -TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x4(0xFF, (__m128)(__v4sf){0,1,2,3}), 0,1,2,3,0,1,2,3)); +TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x4(0xAA, (__m128)(__v4sf){0,1,2,3}), 0,1,0,3,0,1,0,3)); __m256i test_mm256_broadcast_i32x4(__m128i const* __A) { // CHECK-LABEL: test_mm256_broadcast_i32x4 @@ -9052,7 +9052,7 @@ __m256i test_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i const* __A) { return _mm256_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A)); } -TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x4(0xFF, (__m128i)(__v4si){0,1,2,3}), 0,1,2,3,0,1,2,3)); +TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x4(0xAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,3,0,1,0,3)); __m256d test_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) { // CHECK-LABEL: test_mm256_mask_broadcastsd_pd @@ -10376,7 +10376,7 @@ __m256 test_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return _mm256_mask_movehdup_ps(__W, __U, __A); } -TEST_CONSTEXPR(match_m256(_mm256_mask_movehdup_ps(_mm256_setzero_ps(), 0xFF, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 2,2,4,4,6,6,8,8)); +TEST_CONSTEXPR(match_m256(_mm256_mask_movehdup_ps(_mm256_setzero_ps(), 0xAA, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 0,2,0,4,0,6,0,8)); __m256 test_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_movehdup_ps @@ -10408,7 +10408,7 @@ __m256 test_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return _mm256_mask_moveldup_ps(__W, __U, __A); } -TEST_CONSTEXPR(match_m256(_mm256_mask_moveldup_ps(_mm256_setzero_ps(), 0xFF, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 1,1,3,3,5,5,7,7)); +TEST_CONSTEXPR(match_m256(_mm256_mask_moveldup_ps(_mm256_setzero_ps(), 0xAA, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 0,1,0,3,0,5,0,7)); __m256 test_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_moveldup_ps diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c index 6cefa73a9aba0..4773b60986169 100644 --- a/clang/test/CodeGen/X86/avx512vldq-builtins.c +++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c @@ -987,7 +987,7 @@ __m256 test_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) { return _mm256_mask_broadcast_f32x2(__O, __M, __A); } -TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); +TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xAA, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 0,2.f,0,2.f,0,2.f,0,2.f)); __m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { // CHECK-LABEL: test_mm256_maskz_broadcast_f32x2 @@ -996,7 +996,7 @@ __m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { return _mm256_maskz_broadcast_f32x2(__M, __A); } -TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xFF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,2.f,1.f,2.f,1.f,2.f,1.f,2.f)); +TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xAA, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 0,2.f,0,2.f,0,2.f,0,2.f)); __m256d test_mm256_broadcast_f64x2(double const* __A) { // CHECK-LABEL: test_mm256_broadcast_f64x2 @@ -1012,7 +1012,7 @@ __m256d test_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, double const* return _mm256_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A)); } -TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); +TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xA, (__m128d)(__v2df){1.0,2.0}), 0,2.0,0,2.0)); __m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { // CHECK-LABEL: test_mm256_maskz_broadcast_f64x2 @@ -1021,7 +1021,7 @@ __m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { return _mm256_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A)); } -TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xFF, (__m128d)(__v2df){1.0,2.0}), 1.0,2.0,1.0,2.0)); +TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xA, (__m128d)(__v2df){1.0,2.0}), 0,2.0,0,2.0)); __m128i test_mm_broadcast_i32x2(__m128i __A) { // CHECK-LABEL: test_mm_broadcast_i32x2 @@ -1071,7 +1071,7 @@ __m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return _mm256_maskz_broadcast_i32x2(__M, __A); } -TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x2(0xFF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1)); +TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x2(0xAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1)); __m256i test_mm256_broadcast_i64x2(__m128i const* __A) { // CHECK-LABEL: test_mm256_broadcast_i64x2