Skip to content

Commit 44e6bc6

Browse files
TianYe717RKSimon
andauthored
[Headers][X86] Allow AVX2/AVX512 broadcast intrinsics to be used in Constexpr (#153363)
Fix [issue](#152499) This patch adds support for the following broadcast intrinsics by wrapping them around existing generic shuffle implementations: ``` _mm_broadcastb_epi8 _mm_broadcastw_epi16 _mm_broadcastd_epi32 _mm_broadcastq_epi64 _mm_broadcastss_ps _mm_broadcastsd_pd _mm256_broadcastb_epi8 _mm256_broadcastw_epi16 _mm256_broadcastd_epi32 _mm256_broadcastq_epi64 _mm256_broadcastss_ps _mm256_broadcastsd_pd _mm256_broadcastsi128_si256 _mm512_broadcastb_epi8 _mm512_broadcastw_epi16 _mm512_broadcastd_epi32 _mm512_broadcastq_epi64 _mm512_broadcastss_ps _mm512_broadcastsd_pd _mm512_broadcast_f32x2 _mm256_broadcast_f32x2 _mm512_broadcast_i32x2 _mm256_broadcast_i32x2 _mm_broadcast_i32x2 _mm512_broadcast_f32x4 _mm256_broadcast_f32x4 _mm512_broadcast_i32x4 _mm256_broadcast_i32x4 _mm512_broadcast_f32x8 _mm512_broadcast_i32x8 _mm512_broadcast_f64x2 _mm256_broadcast_f64x2 _mm512_broadcast_i64x2 _mm256_broadcast_i64x2 _mm512_broadcast_f64x4 _mm512_broadcast_i64x4 ``` Co-authored-by: Simon Pilgrim <[email protected]>
1 parent b24b8a5 commit 44e6bc6

12 files changed

+130
-109
lines changed

clang/lib/Headers/avx2intrin.h

Lines changed: 26 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2989,9 +2989,8 @@ _mm256_stream_load_si256(const void *__V)
29892989
/// \param __X
29902990
/// A 128-bit vector of [4 x float] whose low element will be broadcast.
29912991
/// \returns A 128-bit vector of [4 x float] containing the result.
2992-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
2993-
_mm_broadcastss_ps(__m128 __X)
2994-
{
2992+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
2993+
_mm_broadcastss_ps(__m128 __X) {
29952994
return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
29962995
}
29972996

@@ -3006,9 +3005,8 @@ _mm_broadcastss_ps(__m128 __X)
30063005
/// \param __a
30073006
/// A 128-bit vector of [2 x double] whose low element will be broadcast.
30083007
/// \returns A 128-bit vector of [2 x double] containing the result.
3009-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
3010-
_mm_broadcastsd_pd(__m128d __a)
3011-
{
3008+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
3009+
_mm_broadcastsd_pd(__m128d __a) {
30123010
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
30133011
}
30143012

@@ -3023,9 +3021,8 @@ _mm_broadcastsd_pd(__m128d __a)
30233021
/// \param __X
30243022
/// A 128-bit vector of [4 x float] whose low element will be broadcast.
30253023
/// \returns A 256-bit vector of [8 x float] containing the result.
3026-
static __inline__ __m256 __DEFAULT_FN_ATTRS256
3027-
_mm256_broadcastss_ps(__m128 __X)
3028-
{
3024+
static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
3025+
_mm256_broadcastss_ps(__m128 __X) {
30293026
return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
30303027
}
30313028

@@ -3040,9 +3037,8 @@ _mm256_broadcastss_ps(__m128 __X)
30403037
/// \param __X
30413038
/// A 128-bit vector of [2 x double] whose low element will be broadcast.
30423039
/// \returns A 256-bit vector of [4 x double] containing the result.
3043-
static __inline__ __m256d __DEFAULT_FN_ATTRS256
3044-
_mm256_broadcastsd_pd(__m128d __X)
3045-
{
3040+
static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
3041+
_mm256_broadcastsd_pd(__m128d __X) {
30463042
return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
30473043
}
30483044

@@ -3056,9 +3052,8 @@ _mm256_broadcastsd_pd(__m128d __X)
30563052
/// \param __X
30573053
/// A 128-bit integer vector to be broadcast.
30583054
/// \returns A 256-bit integer vector containing the result.
3059-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3060-
_mm256_broadcastsi128_si256(__m128i __X)
3061-
{
3055+
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3056+
_mm256_broadcastsi128_si256(__m128i __X) {
30623057
return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
30633058
}
30643059

@@ -3148,9 +3143,8 @@ _mm256_broadcastsi128_si256(__m128i __X)
31483143
/// \param __X
31493144
/// A 128-bit integer vector whose low byte will be broadcast.
31503145
/// \returns A 256-bit integer vector containing the result.
3151-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3152-
_mm256_broadcastb_epi8(__m128i __X)
3153-
{
3146+
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3147+
_mm256_broadcastb_epi8(__m128i __X) {
31543148
return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
31553149
}
31563150

@@ -3164,9 +3158,8 @@ _mm256_broadcastb_epi8(__m128i __X)
31643158
/// \param __X
31653159
/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
31663160
/// \returns A 256-bit vector of [16 x i16] containing the result.
3167-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3168-
_mm256_broadcastw_epi16(__m128i __X)
3169-
{
3161+
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3162+
_mm256_broadcastw_epi16(__m128i __X) {
31703163
return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
31713164
}
31723165

@@ -3180,9 +3173,8 @@ _mm256_broadcastw_epi16(__m128i __X)
31803173
/// \param __X
31813174
/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
31823175
/// \returns A 256-bit vector of [8 x i32] containing the result.
3183-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3184-
_mm256_broadcastd_epi32(__m128i __X)
3185-
{
3176+
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3177+
_mm256_broadcastd_epi32(__m128i __X) {
31863178
return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
31873179
}
31883180

@@ -3196,9 +3188,8 @@ _mm256_broadcastd_epi32(__m128i __X)
31963188
/// \param __X
31973189
/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
31983190
/// \returns A 256-bit vector of [4 x i64] containing the result.
3199-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
3200-
_mm256_broadcastq_epi64(__m128i __X)
3201-
{
3191+
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
3192+
_mm256_broadcastq_epi64(__m128i __X) {
32023193
return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
32033194
}
32043195

@@ -3212,9 +3203,8 @@ _mm256_broadcastq_epi64(__m128i __X)
32123203
/// \param __X
32133204
/// A 128-bit integer vector whose low byte will be broadcast.
32143205
/// \returns A 128-bit integer vector containing the result.
3215-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3216-
_mm_broadcastb_epi8(__m128i __X)
3217-
{
3206+
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3207+
_mm_broadcastb_epi8(__m128i __X) {
32183208
return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
32193209
}
32203210

@@ -3228,9 +3218,8 @@ _mm_broadcastb_epi8(__m128i __X)
32283218
/// \param __X
32293219
/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
32303220
/// \returns A 128-bit vector of [8 x i16] containing the result.
3231-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3232-
_mm_broadcastw_epi16(__m128i __X)
3233-
{
3221+
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3222+
_mm_broadcastw_epi16(__m128i __X) {
32343223
return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
32353224
}
32363225

@@ -3244,9 +3233,8 @@ _mm_broadcastw_epi16(__m128i __X)
32443233
/// \param __X
32453234
/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
32463235
/// \returns A 128-bit vector of [4 x i32] containing the result.
3247-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3248-
_mm_broadcastd_epi32(__m128i __X)
3249-
{
3236+
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3237+
_mm_broadcastd_epi32(__m128i __X) {
32503238
return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
32513239
}
32523240

@@ -3260,9 +3248,8 @@ _mm_broadcastd_epi32(__m128i __X)
32603248
/// \param __X
32613249
/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
32623250
/// \returns A 128-bit vector of [2 x i64] containing the result.
3263-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
3264-
_mm_broadcastq_epi64(__m128i __X)
3265-
{
3251+
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
3252+
_mm_broadcastq_epi64(__m128i __X) {
32663253
return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
32673254
}
32683255

clang/lib/Headers/avx512bwintrin.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1881,9 +1881,8 @@ _mm512_movm_epi16 (__mmask32 __A)
18811881
return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
18821882
}
18831883

1884-
static __inline__ __m512i __DEFAULT_FN_ATTRS512
1885-
_mm512_broadcastb_epi8 (__m128i __A)
1886-
{
1884+
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
1885+
_mm512_broadcastb_epi8(__m128i __A) {
18871886
return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A,
18881887
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
18891888
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -1923,9 +1922,8 @@ _mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
19231922
(__v32hi) _mm512_setzero_si512());
19241923
}
19251924

1926-
static __inline__ __m512i __DEFAULT_FN_ATTRS512
1927-
_mm512_broadcastw_epi16 (__m128i __A)
1928-
{
1925+
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
1926+
_mm512_broadcastw_epi16(__m128i __A) {
19291927
return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A,
19301928
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19311929
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

clang/lib/Headers/avx512dqintrin.h

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1084,10 +1084,8 @@ _mm512_movepi64_mask (__m512i __A)
10841084
return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
10851085
}
10861086

1087-
1088-
static __inline__ __m512 __DEFAULT_FN_ATTRS512
1089-
_mm512_broadcast_f32x2 (__m128 __A)
1090-
{
1087+
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
1088+
_mm512_broadcast_f32x2(__m128 __A) {
10911089
return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
10921090
0, 1, 0, 1, 0, 1, 0, 1,
10931091
0, 1, 0, 1, 0, 1, 0, 1);
@@ -1109,9 +1107,8 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
11091107
(__v16sf)_mm512_setzero_ps());
11101108
}
11111109

1112-
static __inline__ __m512 __DEFAULT_FN_ATTRS512
1113-
_mm512_broadcast_f32x8(__m256 __A)
1114-
{
1110+
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
1111+
_mm512_broadcast_f32x8(__m256 __A) {
11151112
return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
11161113
0, 1, 2, 3, 4, 5, 6, 7,
11171114
0, 1, 2, 3, 4, 5, 6, 7);
@@ -1133,9 +1130,8 @@ _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
11331130
(__v16sf)_mm512_setzero_ps());
11341131
}
11351132

1136-
static __inline__ __m512d __DEFAULT_FN_ATTRS512
1137-
_mm512_broadcast_f64x2(__m128d __A)
1138-
{
1133+
static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
1134+
_mm512_broadcast_f64x2(__m128d __A) {
11391135
return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
11401136
0, 1, 0, 1, 0, 1, 0, 1);
11411137
}
@@ -1156,9 +1152,8 @@ _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
11561152
(__v8df)_mm512_setzero_pd());
11571153
}
11581154

1159-
static __inline__ __m512i __DEFAULT_FN_ATTRS512
1160-
_mm512_broadcast_i32x2 (__m128i __A)
1161-
{
1155+
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
1156+
_mm512_broadcast_i32x2(__m128i __A) {
11621157
return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
11631158
0, 1, 0, 1, 0, 1, 0, 1,
11641159
0, 1, 0, 1, 0, 1, 0, 1);
@@ -1180,9 +1175,8 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
11801175
(__v16si)_mm512_setzero_si512());
11811176
}
11821177

1183-
static __inline__ __m512i __DEFAULT_FN_ATTRS512
1184-
_mm512_broadcast_i32x8(__m256i __A)
1185-
{
1178+
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
1179+
_mm512_broadcast_i32x8(__m256i __A) {
11861180
return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
11871181
0, 1, 2, 3, 4, 5, 6, 7,
11881182
0, 1, 2, 3, 4, 5, 6, 7);
@@ -1204,9 +1198,8 @@ _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
12041198
(__v16si)_mm512_setzero_si512());
12051199
}
12061200

1207-
static __inline__ __m512i __DEFAULT_FN_ATTRS512
1208-
_mm512_broadcast_i64x2(__m128i __A)
1209-
{
1201+
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
1202+
_mm512_broadcast_i64x2(__m128i __A) {
12101203
return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
12111204
0, 1, 0, 1, 0, 1, 0, 1);
12121205
}

clang/lib/Headers/avx512fintrin.h

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -218,9 +218,8 @@ _mm512_undefined_epi32(void)
218218
return (__m512i)__builtin_ia32_undef512();
219219
}
220220

221-
static __inline__ __m512i __DEFAULT_FN_ATTRS512
222-
_mm512_broadcastd_epi32 (__m128i __A)
223-
{
221+
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
222+
_mm512_broadcastd_epi32(__m128i __A) {
224223
return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
225224
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
226225
}
@@ -241,9 +240,8 @@ _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
241240
(__v16si) _mm512_setzero_si512());
242241
}
243242

244-
static __inline__ __m512i __DEFAULT_FN_ATTRS512
245-
_mm512_broadcastq_epi64 (__m128i __A)
246-
{
243+
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
244+
_mm512_broadcastq_epi64(__m128i __A) {
247245
return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
248246
0, 0, 0, 0, 0, 0, 0, 0);
249247
}
@@ -344,9 +342,8 @@ _mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
344342
(__v8di)_mm512_setzero_si512());
345343
}
346344

347-
static __inline__ __m512 __DEFAULT_FN_ATTRS512
348-
_mm512_broadcastss_ps(__m128 __A)
349-
{
345+
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
346+
_mm512_broadcastss_ps(__m128 __A) {
350347
return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
351348
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
352349
}
@@ -389,9 +386,8 @@ _mm512_set4_ps(float __A, float __B, float __C, float __D) {
389386
#define _mm512_setr4_ps(e0,e1,e2,e3) \
390387
_mm512_set4_ps((e3),(e2),(e1),(e0))
391388

392-
static __inline__ __m512d __DEFAULT_FN_ATTRS512
393-
_mm512_broadcastsd_pd(__m128d __A)
394-
{
389+
static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
390+
_mm512_broadcastsd_pd(__m128d __A) {
395391
return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
396392
0, 0, 0, 0, 0, 0, 0, 0);
397393
}
@@ -6795,9 +6791,8 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
67956791
(__v4sf)_mm_setzero_ps(), \
67966792
(__mmask8)(U), (int)(R)))
67976793

6798-
static __inline__ __m512 __DEFAULT_FN_ATTRS512
6799-
_mm512_broadcast_f32x4(__m128 __A)
6800-
{
6794+
static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
6795+
_mm512_broadcast_f32x4(__m128 __A) {
68016796
return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
68026797
0, 1, 2, 3, 0, 1, 2, 3,
68036798
0, 1, 2, 3, 0, 1, 2, 3);
@@ -6819,9 +6814,8 @@ _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
68196814
(__v16sf)_mm512_setzero_ps());
68206815
}
68216816

6822-
static __inline__ __m512d __DEFAULT_FN_ATTRS512
6823-
_mm512_broadcast_f64x4(__m256d __A)
6824-
{
6817+
static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
6818+
_mm512_broadcast_f64x4(__m256d __A) {
68256819
return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
68266820
0, 1, 2, 3, 0, 1, 2, 3);
68276821
}
@@ -6842,9 +6836,8 @@ _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
68426836
(__v8df)_mm512_setzero_pd());
68436837
}
68446838

6845-
static __inline__ __m512i __DEFAULT_FN_ATTRS512
6846-
_mm512_broadcast_i32x4(__m128i __A)
6847-
{
6839+
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
6840+
_mm512_broadcast_i32x4(__m128i __A) {
68486841
return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
68496842
0, 1, 2, 3, 0, 1, 2, 3,
68506843
0, 1, 2, 3, 0, 1, 2, 3);
@@ -6866,9 +6859,8 @@ _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
68666859
(__v16si)_mm512_setzero_si512());
68676860
}
68686861

6869-
static __inline__ __m512i __DEFAULT_FN_ATTRS512
6870-
_mm512_broadcast_i64x4(__m256i __A)
6871-
{
6862+
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
6863+
_mm512_broadcast_i64x4(__m256i __A) {
68726864
return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
68736865
0, 1, 2, 3, 0, 1, 2, 3);
68746866
}

0 commit comments

Comments
 (0)