[Headers][X86] Allow AVX cast intrinsics to be used in constexpr (#152730)

RKSimon · web-flow · commit e64224a224a9 · 2025-08-08T15:39:39.000+01:00
Still missing the "extend to 256-bit" casts - _mm256_castpd128_pd256 / _mm256_castps128_ps256 / _mm256_castsi128_si256 - due to constexpr not liking undefined/poison etc.
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
@@ -4367,7 +4367,7 @@ _mm256_setzero_si256(void) {
 ///    A 256-bit floating-point vector of [4 x double].
 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
 ///    bitwise pattern as the parameter.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castpd_ps(__m256d __a)
 {
   return (__m256)__a;
@@ -4384,7 +4384,7 @@ _mm256_castpd_ps(__m256d __a)
 ///    A 256-bit floating-point vector of [4 x double].
 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castpd_si256(__m256d __a)
 {
   return (__m256i)__a;
@@ -4401,7 +4401,7 @@ _mm256_castpd_si256(__m256d __a)
 ///    A 256-bit floating-point vector of [8 x float].
 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
 ///    bitwise pattern as the parameter.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castps_pd(__m256 __a)
 {
   return (__m256d)__a;
@@ -4418,7 +4418,7 @@ _mm256_castps_pd(__m256 __a)
 ///    A 256-bit floating-point vector of [8 x float].
 /// \returns A 256-bit integer vector containing the same bitwise pattern as the
 ///    parameter.
-static __inline __m256i __DEFAULT_FN_ATTRS
+static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castps_si256(__m256 __a)
 {
   return (__m256i)__a;
@@ -4435,7 +4435,7 @@ _mm256_castps_si256(__m256 __a)
 ///    A 256-bit integer vector.
 /// \returns A 256-bit floating-point vector of [8 x float] containing the same
 ///    bitwise pattern as the parameter.
-static __inline __m256 __DEFAULT_FN_ATTRS
+static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castsi256_ps(__m256i __a)
 {
   return (__m256)__a;
@@ -4452,7 +4452,7 @@ _mm256_castsi256_ps(__m256i __a)
 ///    A 256-bit integer vector.
 /// \returns A 256-bit floating-point vector of [4 x double] containing the same
 ///    bitwise pattern as the parameter.
-static __inline __m256d __DEFAULT_FN_ATTRS
+static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castsi256_pd(__m256i __a)
 {
   return (__m256d)__a;
@@ -4469,7 +4469,7 @@ _mm256_castsi256_pd(__m256i __a)
 ///    A 256-bit floating-point vector of [4 x double].
 /// \returns A 128-bit floating-point vector of [2 x double] containing the
 ///    lower 128 bits of the parameter.
-static __inline __m128d __DEFAULT_FN_ATTRS
+static __inline __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castpd256_pd128(__m256d __a)
 {
   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
@@ -4486,7 +4486,7 @@ _mm256_castpd256_pd128(__m256d __a)
 ///    A 256-bit floating-point vector of [8 x float].
 /// \returns A 128-bit floating-point vector of [4 x float] containing the
 ///    lower 128 bits of the parameter.
-static __inline __m128 __DEFAULT_FN_ATTRS
+static __inline __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castps256_ps128(__m256 __a)
 {
   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
@@ -4502,7 +4502,7 @@ _mm256_castps256_ps128(__m256 __a)
 ///    A 256-bit integer vector.
 /// \returns A 128-bit integer vector containing the lower 128 bits of the
 ///    parameter.
-static __inline __m128i __DEFAULT_FN_ATTRS
+static __inline __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm256_castsi256_si128(__m256i __a)
 {
   return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
@@ -147,11 +147,13 @@ __m256 test_mm256_castpd_ps(__m256d A) {
   // CHECK-LABEL: test_mm256_castpd_ps
   return _mm256_castpd_ps(A);
 }
+TEST_CONSTEXPR(match_m256(_mm256_castpd_ps((__m256d){-1.0, +2.0, +4.0, -6.0}), +0.0f, -1.875f, +0.0f, +2.0f, +0.0f, +2.25f, 0.0f, -2.375f));
 
 __m256i test_mm256_castpd_si256(__m256d A) {
   // CHECK-LABEL: test_mm256_castpd_si256
   return _mm256_castpd_si256(A);
 }
+TEST_CONSTEXPR(match_m256i(_mm256_castpd_si256((__m256d){-1.0, +2.0, -3.0, +4.0}), 0xBFF0000000000000ULL, 0x4000000000000000ULL, 0xC008000000000000ULL, 0x4010000000000000ULL));
 
 __m256d test_mm256_castpd128_pd256(__m128d A) {
   // CHECK-LABEL: test_mm256_castpd128_pd256
@@ -165,16 +167,19 @@ __m128d test_mm256_castpd256_pd128(__m256d A) {
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
   return _mm256_castpd256_pd128(A);
 }
+TEST_CONSTEXPR(match_m128d(_mm256_castpd256_pd128((__m256d){-1.0, +2.0, -3.0, +4.0}), -1.0, +2.0));
 
 __m256d test_mm256_castps_pd(__m256 A) {
   // CHECK-LABEL: test_mm256_castps_pd
   return _mm256_castps_pd(A);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_castps_pd((__m256){0.0f, -1.0f, 0.0f, 4.0f, 0.0f, -2.0f, 0.0f, 6.0f}), -0.0078125, 512.0, -2.0, +8192.0));
 
 __m256i test_mm256_castps_si256(__m256 A) {
   // CHECK-LABEL: test_mm256_castps_si256
   return _mm256_castps_si256(A);
 }
+TEST_CONSTEXPR(match_m256i(_mm256_castps_si256((__m256){1.0f, -2.0f, -4.0f, 8.0f, -16.0f, +16.0f, +32.0f, -32.0f}), 0xC00000003F800000ULL, 0x41000000c0800000ULL, 0x41800000C1800000ULL, 0xC200000042000000ULL));
 
 __m256 test_mm256_castps128_ps256(__m128 A) {
   // CHECK-LABEL: test_mm256_castps128_ps256
@@ -188,6 +193,7 @@ __m128 test_mm256_castps256_ps128(__m256 A) {
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_castps256_ps128(A);
 }
+TEST_CONSTEXPR(match_m128(_mm256_castps256_ps128((__m256){1.0f, -2.0f, -4.0f, 8.0f, -16.0f, +16.0f, +32.0f, -32.0f}), 1.0f, -2.0f, -4.0f, 8.0f));
 
 __m256i test_mm256_castsi128_si256(__m128i A) {
   // CHECK-LABEL: test_mm256_castsi128_si256
@@ -200,17 +206,20 @@ __m256d test_mm256_castsi256_pd(__m256i A) {
   // CHECK-LABEL: test_mm256_castsi256_pd
   return _mm256_castsi256_pd(A);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_castsi256_pd((__m256i)(__v4du){0x4070000000000000ULL, 0xC000000000000000ULL, 0xBFF0000000000000ULL, 0xC008000000000000ULL}), 256.0, -2.0, -1.0, -3.0));
 
 __m256 test_mm256_castsi256_ps(__m256i A) {
   // CHECK-LABEL: test_mm256_castsi256_ps
   return _mm256_castsi256_ps(A);
 }
+TEST_CONSTEXPR(match_m256(_mm256_castsi256_ps((__m256i)(__v4du){0x42000000c1800000ULL, 0x43000000c2800000ULL, 0x41000000c0800000ULL, 0xC00000003F800000ULL}), -16.0f, 32.0f, -64.0f, 128.0f, -4.0f, 8.0f, 1.0f, -2.0f));
 
 __m128i test_mm256_castsi256_si128(__m256i A) {
   // CHECK-LABEL: test_mm256_castsi256_si128
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
   return _mm256_castsi256_si128(A);
 }
+TEST_CONSTEXPR(match_m128i(_mm256_castsi256_si128((__m256i)(__v4du){0xBFF0000000000000ULL, 0x4070000000000000ULL, 0xC000000000000000ULL, 0xC008000000000000ULL}), 0xBFF0000000000000ULL, 0x4070000000000000ULL));
 
 __m256d test_mm256_ceil_pd(__m256d x) {
   // CHECK-LABEL: test_mm256_ceil_pd