-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[Headers][X86] Convert bf16 to f32 conversions to constexpr implementations #169841
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesFixes #154911 Full diff: https://github.com/llvm/llvm-project/pull/169841.diff 4 Files Affected:
diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h
index 9a21d8680045a..458d1f8b993ba 100644
--- a/clang/lib/Headers/avx512bf16intrin.h
+++ b/clang/lib/Headers/avx512bf16intrin.h
@@ -25,6 +25,14 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
/// Convert One BF16 Data to One Single Float Data.
///
/// \headerfile <x86intrin.h>
@@ -35,7 +43,7 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
/// A bfloat data.
/// \returns A float data whose sign field and exponent field keep unchanged,
/// and fraction field is extended to 23 bits.
-static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
+static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsbh_ss(__bf16 __A) {
return (float)(__A);
}
@@ -235,7 +243,8 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_cvtpbh_ps(__m256bh __A) {
return (__m512) __builtin_convertvector(__A, __v16sf);
}
@@ -249,7 +258,7 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_cvtpbh_ps(__A),
@@ -268,14 +277,16 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
return (__m512)__builtin_ia32_selectps_512(
(__mmask16)__U, (__v16sf)_mm512_cvtpbh_ps(__A), (__v16sf)__S);
}
#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS512
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
#endif
#endif
diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h
index 0e3184554a6c8..8543402065d76 100644
--- a/clang/lib/Headers/avx512vlbf16intrin.h
+++ b/clang/lib/Headers/avx512vlbf16intrin.h
@@ -24,6 +24,14 @@
__target__("avx512vl,avx512bf16"), \
__min_vector_width__(256)))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
@@ -421,7 +429,8 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_cvtpbh_ps(__m128bh __A) {
return (__m128)_mm256_castps256_ps128(
(__m256) __builtin_convertvector(__A, __v8sf));
}
@@ -433,7 +442,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_cvtpbh_ps(__m128bh __A) {
return (__m256) __builtin_convertvector(__A, __v8sf);
}
@@ -447,7 +457,7 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return (__m128)__builtin_ia32_selectps_128(
(__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)_mm_setzero_ps());
@@ -463,7 +473,7 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_cvtpbh_ps(__A),
@@ -483,7 +493,7 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
return (__m128)__builtin_ia32_selectps_128(
(__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S);
@@ -502,7 +512,7 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
return (__m256)__builtin_ia32_selectps_256(
(__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S);
@@ -510,6 +520,8 @@ _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#endif
#endif
diff --git a/clang/test/CodeGen/X86/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c
index dfaae4e459f7e..556c662f13cc0 100644
--- a/clang/test/CodeGen/X86/avx512bf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bf16-builtins.c
@@ -9,6 +9,7 @@
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bf16 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
#include <immintrin.h>
+#include "builtin_test_helpers.h"
float test_mm_cvtsbh_ss(__bf16 A) {
// CHECK-LABEL: test_mm_cvtsbh_ss
@@ -16,6 +17,7 @@ float test_mm_cvtsbh_ss(__bf16 A) {
// CHECK: ret float %{{.*}}
return _mm_cvtsbh_ss(A);
}
+TEST_CONSTEXPR(_mm_cvtsbh_ss(-1.0f) == -1.0f);
__m512bh test_mm512_cvtne2ps_pbh(__m512 A, __m512 B) {
// CHECK-LABEL: test_mm512_cvtne2ps_pbh
@@ -82,6 +84,7 @@ __m512 test_mm512_cvtpbh_ps(__m256bh A) {
// CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
return _mm512_cvtpbh_ps(A);
}
+TEST_CONSTEXPR(match_m512(_mm512_cvtpbh_ps((__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f));
__m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_maskz_cvtpbh_ps
@@ -89,6 +92,7 @@ __m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_cvtpbh_ps(M, A);
}
+TEST_CONSTEXPR(match_m512(_mm512_maskz_cvtpbh_ps(0xA753, (__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, 0.0f, 0.0f, -8.0f, 0.0f, -32.0f, 0.0f, -128.0f, -0.5f, 0.25f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f));
__m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_mask_cvtpbh_ps
@@ -96,3 +100,4 @@ __m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_cvtpbh_ps(S, M, A);
}
+TEST_CONSTEXPR(match_m512(_mm512_mask_cvtpbh_ps((__m512){ 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f }, 0xA753, (__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, 99.0f, 99.0f, -8.0f, 99.0f, -32.0f, 99.0f, -128.0f, -0.5f, 0.25f, 99.0f, 99.0f, 2.0f, 99.0f, 0.0f));
diff --git a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
index 80afcd7a490db..a5adae0cbb935 100644
--- a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
@@ -9,6 +9,7 @@
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bf16 -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
#include <immintrin.h>
+#include "builtin_test_helpers.h"
__m128bh test_mm_cvtne2ps2bf16(__m128 A, __m128 B) {
// CHECK-LABEL: test_mm_cvtne2ps2bf16
@@ -160,12 +161,14 @@ __m128 test_mm_cvtpbh_ps(__m128bh A) {
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm_cvtpbh_ps(A);
}
+TEST_CONSTEXPR(match_m128(_mm_cvtpbh_ps((__m128bh){-8.0f, 16.0f, -32.0f, 64.0f, -0.0f, 1.0f, -2.0f, 4.0f}), -8.0f, 16.0f, -32.0f, 64.0f));
__m256 test_mm256_cvtpbh_ps(__m128bh A) {
// CHECK-LABEL: test_mm256_cvtpbh_ps
// CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
return _mm256_cvtpbh_ps(A);
}
+TEST_CONSTEXPR(match_m256(_mm256_cvtpbh_ps((__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f));
__m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_maskz_cvtpbh_ps
@@ -174,6 +177,7 @@ __m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_cvtpbh_ps(M, A);
}
+TEST_CONSTEXPR(match_m128(_mm_maskz_cvtpbh_ps(0x01, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 0.0f, 0.0f, 0.0f));
__m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_maskz_cvtpbh_ps
@@ -181,6 +185,7 @@ __m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_cvtpbh_ps(M, A);
}
+TEST_CONSTEXPR(match_m256(_mm256_maskz_cvtpbh_ps(0x73, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, 0.0f, 0.0f, -8.0f, 16.0f, -32.0f, 0.0f));
__m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_mask_cvtpbh_ps
@@ -189,6 +194,7 @@ __m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_cvtpbh_ps(S, M, A);
}
+TEST_CONSTEXPR(match_m128(_mm_mask_cvtpbh_ps((__m128){ 99.0f, 99.0f, 99.0f, 99.0f }, 0x03, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, 99.0f, 99.0f));
__m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_mask_cvtpbh_ps
@@ -196,3 +202,4 @@ __m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_cvtpbh_ps(S, M, A);
}
+TEST_CONSTEXPR(match_m256(_mm256_mask_cvtpbh_ps((__m256){ 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f }, 0x37, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, -2.0f, 99.0f, -8.0f, 16.0f, 99.0f, 99.0f));
|
|
@llvm/pr-subscribers-clang Author: Simon Pilgrim (RKSimon) ChangesFixes #154911 Full diff: https://github.com/llvm/llvm-project/pull/169841.diff 4 Files Affected:
diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h
index 9a21d8680045a..458d1f8b993ba 100644
--- a/clang/lib/Headers/avx512bf16intrin.h
+++ b/clang/lib/Headers/avx512bf16intrin.h
@@ -25,6 +25,14 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
/// Convert One BF16 Data to One Single Float Data.
///
/// \headerfile <x86intrin.h>
@@ -35,7 +43,7 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
/// A bfloat data.
/// \returns A float data whose sign field and exponent field keep unchanged,
/// and fraction field is extended to 23 bits.
-static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
+static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsbh_ss(__bf16 __A) {
return (float)(__A);
}
@@ -235,7 +243,8 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_cvtpbh_ps(__m256bh __A) {
return (__m512) __builtin_convertvector(__A, __v16sf);
}
@@ -249,7 +258,7 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_cvtpbh_ps(__A),
@@ -268,14 +277,16 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
return (__m512)__builtin_ia32_selectps_512(
(__mmask16)__U, (__v16sf)_mm512_cvtpbh_ps(__A), (__v16sf)__S);
}
#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS512
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
#endif
#endif
diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h
index 0e3184554a6c8..8543402065d76 100644
--- a/clang/lib/Headers/avx512vlbf16intrin.h
+++ b/clang/lib/Headers/avx512vlbf16intrin.h
@@ -24,6 +24,14 @@
__target__("avx512vl,avx512bf16"), \
__min_vector_width__(256)))
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
@@ -421,7 +429,8 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_cvtpbh_ps(__m128bh __A) {
return (__m128)_mm256_castps256_ps128(
(__m256) __builtin_convertvector(__A, __v8sf));
}
@@ -433,7 +442,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_cvtpbh_ps(__m128bh __A) {
return (__m256) __builtin_convertvector(__A, __v8sf);
}
@@ -447,7 +457,7 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return (__m128)__builtin_ia32_selectps_128(
(__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)_mm_setzero_ps());
@@ -463,7 +473,7 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_cvtpbh_ps(__A),
@@ -483,7 +493,7 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
return (__m128)__builtin_ia32_selectps_128(
(__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S);
@@ -502,7 +512,7 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
return (__m256)__builtin_ia32_selectps_256(
(__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S);
@@ -510,6 +520,8 @@ _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
#endif
#endif
diff --git a/clang/test/CodeGen/X86/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c
index dfaae4e459f7e..556c662f13cc0 100644
--- a/clang/test/CodeGen/X86/avx512bf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bf16-builtins.c
@@ -9,6 +9,7 @@
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bf16 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
#include <immintrin.h>
+#include "builtin_test_helpers.h"
float test_mm_cvtsbh_ss(__bf16 A) {
// CHECK-LABEL: test_mm_cvtsbh_ss
@@ -16,6 +17,7 @@ float test_mm_cvtsbh_ss(__bf16 A) {
// CHECK: ret float %{{.*}}
return _mm_cvtsbh_ss(A);
}
+TEST_CONSTEXPR(_mm_cvtsbh_ss(-1.0f) == -1.0f);
__m512bh test_mm512_cvtne2ps_pbh(__m512 A, __m512 B) {
// CHECK-LABEL: test_mm512_cvtne2ps_pbh
@@ -82,6 +84,7 @@ __m512 test_mm512_cvtpbh_ps(__m256bh A) {
// CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
return _mm512_cvtpbh_ps(A);
}
+TEST_CONSTEXPR(match_m512(_mm512_cvtpbh_ps((__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f));
__m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_maskz_cvtpbh_ps
@@ -89,6 +92,7 @@ __m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_cvtpbh_ps(M, A);
}
+TEST_CONSTEXPR(match_m512(_mm512_maskz_cvtpbh_ps(0xA753, (__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, 0.0f, 0.0f, -8.0f, 0.0f, -32.0f, 0.0f, -128.0f, -0.5f, 0.25f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f));
__m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_mask_cvtpbh_ps
@@ -96,3 +100,4 @@ __m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_cvtpbh_ps(S, M, A);
}
+TEST_CONSTEXPR(match_m512(_mm512_mask_cvtpbh_ps((__m512){ 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f }, 0xA753, (__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, 99.0f, 99.0f, -8.0f, 99.0f, -32.0f, 99.0f, -128.0f, -0.5f, 0.25f, 99.0f, 99.0f, 2.0f, 99.0f, 0.0f));
diff --git a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
index 80afcd7a490db..a5adae0cbb935 100644
--- a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
@@ -9,6 +9,7 @@
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bf16 -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
#include <immintrin.h>
+#include "builtin_test_helpers.h"
__m128bh test_mm_cvtne2ps2bf16(__m128 A, __m128 B) {
// CHECK-LABEL: test_mm_cvtne2ps2bf16
@@ -160,12 +161,14 @@ __m128 test_mm_cvtpbh_ps(__m128bh A) {
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm_cvtpbh_ps(A);
}
+TEST_CONSTEXPR(match_m128(_mm_cvtpbh_ps((__m128bh){-8.0f, 16.0f, -32.0f, 64.0f, -0.0f, 1.0f, -2.0f, 4.0f}), -8.0f, 16.0f, -32.0f, 64.0f));
__m256 test_mm256_cvtpbh_ps(__m128bh A) {
// CHECK-LABEL: test_mm256_cvtpbh_ps
// CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
return _mm256_cvtpbh_ps(A);
}
+TEST_CONSTEXPR(match_m256(_mm256_cvtpbh_ps((__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f));
__m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_maskz_cvtpbh_ps
@@ -174,6 +177,7 @@ __m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_cvtpbh_ps(M, A);
}
+TEST_CONSTEXPR(match_m128(_mm_maskz_cvtpbh_ps(0x01, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 0.0f, 0.0f, 0.0f));
__m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_maskz_cvtpbh_ps
@@ -181,6 +185,7 @@ __m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_cvtpbh_ps(M, A);
}
+TEST_CONSTEXPR(match_m256(_mm256_maskz_cvtpbh_ps(0x73, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, 0.0f, 0.0f, -8.0f, 16.0f, -32.0f, 0.0f));
__m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_mask_cvtpbh_ps
@@ -189,6 +194,7 @@ __m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
// CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_cvtpbh_ps(S, M, A);
}
+TEST_CONSTEXPR(match_m128(_mm_mask_cvtpbh_ps((__m128){ 99.0f, 99.0f, 99.0f, 99.0f }, 0x03, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, 99.0f, 99.0f));
__m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_mask_cvtpbh_ps
@@ -196,3 +202,4 @@ __m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_cvtpbh_ps(S, M, A);
}
+TEST_CONSTEXPR(match_m256(_mm256_mask_cvtpbh_ps((__m256){ 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f }, 0x37, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, -2.0f, 99.0f, -8.0f, 16.0f, 99.0f, 99.0f));
|
| /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from | ||
| /// conversion of __B, and higher 256 bits come from conversion of __A. | ||
| static __inline__ __m512bh __DEFAULT_FN_ATTRS512 | ||
| _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why leave these 3 not constexpr?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm just looking at bf16 to f32 in this patch as this can be safely handled entirely generically
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I see the title now :)
| /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. | ||
| static __inline__ __m256bh __DEFAULT_FN_ATTRS512 | ||
| _mm512_cvtneps_pbh(__m512 __A) { | ||
| return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we use __builtin_convertvector for this too?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The docs indicate these always round to nearest even - we could just handle the builtins in constexpr - similar to how we handle PS2PH?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm hesitant to use __builtin_convertvector as although it looks like we're consistent with fp32 -> bf16 using round rmNearestTiesToEven in dag, I don't know the code well enough to be sure that we do this everywhere - do you think its ok?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When AVX512BF16 is enabled, it's lowered to vcvtneps2bf16, https://godbolt.org/z/d5snYdGn4. Otherwise, it's scalarized with libcall. Since we change for intrinsic, AVX512BF16 is guaranteed. I don't see any risk here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, it's simple than PS2PH since round to nearest even is the only rounding mode.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cheers - I'll create a ticket
phoebewang
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
Fixes #154911