Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
0f0effd
feat: [Headers][X86] Allow AVX512 masked shuffles to be used in const…
ahmednour25689 Oct 7, 2025
facdc7e
chor: apply formatting
ahmednour25689 Oct 7, 2025
d9488e5
refactor: update tests
ahmednour25689 Oct 7, 2025
c9472b6
refactor: update test
ahmednour25689 Oct 7, 2025
1fcea1b
refactor: PR Feedback
ahmednour25689 Oct 7, 2025
39619bf
chore: fix formatting
ahmednour25689 Oct 7, 2025
27759e2
Merge branch 'main' into an/avx-masked-shuffles
ahmednour25689 Oct 7, 2025
fcf5e8b
Merge branch 'main' into an/avx-masked-shuffles
ahmednour25689 Oct 8, 2025
b4101ff
Merge branch 'main' into an/avx-masked-shuffles
ahmednour25689 Oct 9, 2025
539b86e
chore: Move tests from avx512vl-builtins
ahmednour25689 Oct 9, 2025
ccc654a
chore: Move test func from avx512f-builtins to corresponding test file
ahmednour25689 Oct 9, 2025
3c88c39
Merge branch 'main' into an/avx-masked-shuffles
ahmednour25689 Oct 9, 2025
c2df2f4
Merge branch 'main' into an/avx-masked-shuffles
ahmednour25689 Oct 10, 2025
467f6ef
Merge branch 'main' into an/avx-masked-shuffles
ahmednour25689 Oct 12, 2025
1e989b0
Merge branch 'main' into an/avx-masked-shuffles
RKSimon Oct 14, 2025
f0f7f3f
chore: PR Feedback
ahmednour25689 Oct 15, 2025
505dfd3
Merge branch 'main' into an/avx-masked-shuffles
ahmednour25689 Oct 15, 2025
df5211e
chore: add missing tests
ahmednour25689 Oct 16, 2025
89ad0c3
refactor: update tests
ahmednour25689 Oct 16, 2025
cfea576
feat: Use partial masks
ahmednour25689 Oct 16, 2025
1c01f28
Merge branch 'main' into an/avx-masked-shuffles
ahmednour25689 Oct 16, 2025
b5d9eea
Merge branch 'main' into an/avx-masked-shuffles
RKSimon Oct 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 24 additions & 36 deletions clang/lib/Headers/avx512dqintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1083,17 +1083,15 @@ _mm512_broadcast_f32x2(__m128 __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x2(__A),
(__v16sf)__O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x2(__A),
(__v16sf)_mm512_setzero_ps());
Expand All @@ -1106,17 +1104,15 @@ _mm512_broadcast_f32x8(__m256 __A) {
0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x8(__A),
(__v16sf)__O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x8(__A),
(__v16sf)_mm512_setzero_ps());
Expand All @@ -1128,17 +1124,15 @@ _mm512_broadcast_f64x2(__m128d __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
{
static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) {
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
(__v8df)_mm512_broadcast_f64x2(__A),
(__v8df)__O);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
{
static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
(__v8df)_mm512_broadcast_f64x2(__A),
(__v8df)_mm512_setzero_pd());
Expand All @@ -1151,17 +1145,15 @@ _mm512_broadcast_i32x2(__m128i __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x2(__A),
(__v16si)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x2(__A),
(__v16si)_mm512_setzero_si512());
Expand All @@ -1174,17 +1166,15 @@ _mm512_broadcast_i32x8(__m256i __A) {
0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x8(__A),
(__v16si)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x8(__A),
(__v16si)_mm512_setzero_si512());
Expand All @@ -1196,17 +1186,15 @@ _mm512_broadcast_i64x2(__m128i __A) {
0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
(__v8di)_mm512_broadcast_i64x2(__A),
(__v8di)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
(__v8di)_mm512_broadcast_i64x2(__A),
(__v8di)_mm512_setzero_si512());
Expand Down
126 changes: 50 additions & 76 deletions clang/lib/Headers/avx512fintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,17 +225,15 @@ _mm512_broadcastd_epi32(__m128i __A) {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512(__M,
(__v16si) _mm512_broadcastd_epi32(__A),
(__v16si) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512(__M,
(__v16si) _mm512_broadcastd_epi32(__A),
(__v16si) _mm512_setzero_si512());
Expand All @@ -247,18 +245,14 @@ _mm512_broadcastq_epi64(__m128i __A) {
0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di) _mm512_broadcastq_epi64(__A),
(__v8di) __O);

static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectq_512(
__M, (__v8di)_mm512_broadcastq_epi64(__A), (__v8di)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di) _mm512_broadcastq_epi64(__A),
(__v8di) _mm512_setzero_si512());
Expand Down Expand Up @@ -321,9 +315,8 @@ _mm512_set1_epi32(int __s)
__s, __s, __s, __s, __s, __s, __s, __s };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
{
static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_set1_epi32(__mmask16 __M, int __A) {
return (__m512i)__builtin_ia32_selectd_512(__M,
(__v16si)_mm512_set1_epi32(__A),
(__v16si)_mm512_setzero_si512());
Expand All @@ -335,9 +328,8 @@ _mm512_set1_epi64(long long __d)
return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
{
static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) {
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_set1_epi64(__A),
(__v8di)_mm512_setzero_si512());
Expand Down Expand Up @@ -6552,17 +6544,15 @@ _mm512_broadcast_f32x4(__m128 __A) {
0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x4(__A),
(__v16sf)__O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x4(__A),
(__v16sf)_mm512_setzero_ps());
Expand Down Expand Up @@ -6597,17 +6587,15 @@ _mm512_broadcast_i32x4(__m128i __A) {
0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x4(__A),
(__v16si)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x4(__A),
(__v16si)_mm512_setzero_si512());
Expand Down Expand Up @@ -6635,33 +6623,29 @@ _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
(__v8di)_mm512_setzero_si512());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
{
static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) {
return (__m512d)__builtin_ia32_selectpd_512(__M,
(__v8df) _mm512_broadcastsd_pd(__A),
(__v8df) __O);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
{
static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) {
return (__m512d)__builtin_ia32_selectpd_512(__M,
(__v8df) _mm512_broadcastsd_pd(__A),
(__v8df) _mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512(__M,
(__v16sf) _mm512_broadcastss_ps(__A),
(__v16sf) __O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) {
return (__m512)__builtin_ia32_selectps_512(__M,
(__v16sf) _mm512_broadcastss_ps(__A),
(__v16sf) _mm512_setzero_ps());
Expand Down Expand Up @@ -8381,17 +8365,15 @@ _mm512_movehdup_ps (__m512 __A)
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_movehdup_ps(__A),
(__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_movehdup_ps(__A),
(__v16sf)_mm512_setzero_ps());
Expand All @@ -8404,44 +8386,38 @@ _mm512_moveldup_ps (__m512 __A)
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_moveldup_ps(__A),
(__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
{
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_moveldup_ps(__A),
(__v16sf)_mm512_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_move_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_move_ss(__mmask8 __U, __m128 __A, __m128 __B) {
return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
_mm_setzero_ps());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_move_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_move_sd(__mmask8 __U, __m128d __A, __m128d __B) {
return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
_mm_setzero_pd());
}
Expand Down Expand Up @@ -8884,17 +8860,15 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
}
#endif

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_set1_epi32(__m512i __O, __mmask16 __M, int __A) {
return (__m512i) __builtin_ia32_selectd_512(__M,
(__v16si) _mm512_set1_epi32(__A),
(__v16si) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_set1_epi64(__m512i __O, __mmask8 __M, long long __A) {
return (__m512i) __builtin_ia32_selectq_512(__M,
(__v8di) _mm512_set1_epi64(__A),
(__v8di) __O);
Expand Down
Loading