Skip to content

Conversation

@RKSimon
Copy link
Collaborator

@RKSimon RKSimon commented Nov 27, 2025

Fixes #154911

@RKSimon RKSimon requested a review from phoebewang November 27, 2025 17:37
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics labels Nov 27, 2025
@llvmbot
Copy link
Member

llvmbot commented Nov 27, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Fixes #154911


Full diff: https://github.com/llvm/llvm-project/pull/169841.diff

4 Files Affected:

  • (modified) clang/lib/Headers/avx512bf16intrin.h (+15-4)
  • (modified) clang/lib/Headers/avx512vlbf16intrin.h (+18-6)
  • (modified) clang/test/CodeGen/X86/avx512bf16-builtins.c (+5)
  • (modified) clang/test/CodeGen/X86/avx512vlbf16-builtins.c (+7)
diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h
index 9a21d8680045a..458d1f8b993ba 100644
--- a/clang/lib/Headers/avx512bf16intrin.h
+++ b/clang/lib/Headers/avx512bf16intrin.h
@@ -25,6 +25,14 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
 /// Convert One BF16 Data to One Single Float Data.
 ///
 /// \headerfile <x86intrin.h>
@@ -35,7 +43,7 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
 ///    A bfloat data.
 /// \returns A float data whose sign field and exponent field keep unchanged,
 ///    and fraction field is extended to 23 bits.
-static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
+static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsbh_ss(__bf16 __A) {
   return (float)(__A);
 }
 
@@ -235,7 +243,8 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
 /// \param __A
 ///    A 256-bit vector of [16 x bfloat].
 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_cvtpbh_ps(__m256bh __A) {
   return (__m512) __builtin_convertvector(__A, __v16sf);
 }
 
@@ -249,7 +258,7 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
 /// \param __A
 ///    A 256-bit vector of [16 x bfloat].
 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_cvtpbh_ps(__A),
@@ -268,14 +277,16 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
 /// \param __A
 ///    A 256-bit vector of [16 x bfloat].
 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
   return (__m512)__builtin_ia32_selectps_512(
       (__mmask16)__U, (__v16sf)_mm512_cvtpbh_ps(__A), (__v16sf)__S);
 }
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 #undef __DEFAULT_FN_ATTRS512
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
 
 #endif
 #endif
diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h
index 0e3184554a6c8..8543402065d76 100644
--- a/clang/lib/Headers/avx512vlbf16intrin.h
+++ b/clang/lib/Headers/avx512vlbf16intrin.h
@@ -24,6 +24,14 @@
                  __target__("avx512vl,avx512bf16"),                            \
                  __min_vector_width__(256)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
 /// Convert Two Packed Single Data to One Packed BF16 Data.
 ///
 /// \headerfile <x86intrin.h>
@@ -421,7 +429,8 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
 /// \param __A
 ///    A 128-bit vector of [4 x bfloat].
 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_cvtpbh_ps(__m128bh __A) {
   return (__m128)_mm256_castps256_ps128(
       (__m256) __builtin_convertvector(__A, __v8sf));
 }
@@ -433,7 +442,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_cvtpbh_ps(__m128bh __A) {
   return (__m256) __builtin_convertvector(__A, __v8sf);
 }
 
@@ -447,7 +457,7 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [4 x bfloat].
 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
   return (__m128)__builtin_ia32_selectps_128(
       (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)_mm_setzero_ps());
@@ -463,7 +473,7 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                              (__v8sf)_mm256_cvtpbh_ps(__A),
@@ -483,7 +493,7 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [4 x bfloat].
 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
   return (__m128)__builtin_ia32_selectps_128(
       (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S);
@@ -502,7 +512,7 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
   return (__m256)__builtin_ia32_selectps_256(
       (__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S);
@@ -510,6 +520,8 @@ _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 
 #endif
 #endif
diff --git a/clang/test/CodeGen/X86/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c
index dfaae4e459f7e..556c662f13cc0 100644
--- a/clang/test/CodeGen/X86/avx512bf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bf16-builtins.c
@@ -9,6 +9,7 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bf16 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
 
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 float test_mm_cvtsbh_ss(__bf16 A) {
   // CHECK-LABEL: test_mm_cvtsbh_ss
@@ -16,6 +17,7 @@ float test_mm_cvtsbh_ss(__bf16 A) {
   // CHECK: ret float %{{.*}}
   return _mm_cvtsbh_ss(A);
 }
+TEST_CONSTEXPR(_mm_cvtsbh_ss(-1.0f) == -1.0f);
 
 __m512bh test_mm512_cvtne2ps_pbh(__m512 A, __m512 B) {
   // CHECK-LABEL: test_mm512_cvtne2ps_pbh
@@ -82,6 +84,7 @@ __m512 test_mm512_cvtpbh_ps(__m256bh A) {
   // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
   return _mm512_cvtpbh_ps(A);
 }
+TEST_CONSTEXPR(match_m512(_mm512_cvtpbh_ps((__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f));
 
 __m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
   // CHECK-LABEL: test_mm512_maskz_cvtpbh_ps
@@ -89,6 +92,7 @@ __m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_cvtpbh_ps(M, A);
 }
+TEST_CONSTEXPR(match_m512(_mm512_maskz_cvtpbh_ps(0xA753, (__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, 0.0f, 0.0f, -8.0f, 0.0f, -32.0f, 0.0f, -128.0f, -0.5f, 0.25f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f));
 
 __m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
   // CHECK-LABEL: test_mm512_mask_cvtpbh_ps
@@ -96,3 +100,4 @@ __m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_cvtpbh_ps(S, M, A);
 }
+TEST_CONSTEXPR(match_m512(_mm512_mask_cvtpbh_ps((__m512){ 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f }, 0xA753, (__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, 99.0f, 99.0f, -8.0f, 99.0f, -32.0f, 99.0f, -128.0f, -0.5f, 0.25f, 99.0f, 99.0f, 2.0f, 99.0f, 0.0f));
diff --git a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
index 80afcd7a490db..a5adae0cbb935 100644
--- a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
@@ -9,6 +9,7 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bf16 -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
 
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m128bh test_mm_cvtne2ps2bf16(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_cvtne2ps2bf16
@@ -160,12 +161,14 @@ __m128 test_mm_cvtpbh_ps(__m128bh A) {
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm_cvtpbh_ps(A);
 }
+TEST_CONSTEXPR(match_m128(_mm_cvtpbh_ps((__m128bh){-8.0f, 16.0f, -32.0f, 64.0f, -0.0f, 1.0f, -2.0f, 4.0f}), -8.0f, 16.0f, -32.0f, 64.0f));
 
 __m256 test_mm256_cvtpbh_ps(__m128bh A) {
   // CHECK-LABEL: test_mm256_cvtpbh_ps
   // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
   return _mm256_cvtpbh_ps(A);
 }
+TEST_CONSTEXPR(match_m256(_mm256_cvtpbh_ps((__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f));
 
 __m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
   // CHECK-LABEL: test_mm_maskz_cvtpbh_ps
@@ -174,6 +177,7 @@ __m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_cvtpbh_ps(M, A);
 }
+TEST_CONSTEXPR(match_m128(_mm_maskz_cvtpbh_ps(0x01, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 0.0f, 0.0f, 0.0f));
 
 __m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
   // CHECK-LABEL: test_mm256_maskz_cvtpbh_ps
@@ -181,6 +185,7 @@ __m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_cvtpbh_ps(M, A);
 }
+TEST_CONSTEXPR(match_m256(_mm256_maskz_cvtpbh_ps(0x73, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, 0.0f, 0.0f, -8.0f, 16.0f, -32.0f, 0.0f));
 
 __m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
   // CHECK-LABEL: test_mm_mask_cvtpbh_ps
@@ -189,6 +194,7 @@ __m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_cvtpbh_ps(S, M, A);
 }
+TEST_CONSTEXPR(match_m128(_mm_mask_cvtpbh_ps((__m128){ 99.0f, 99.0f, 99.0f, 99.0f }, 0x03, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, 99.0f, 99.0f));
 
 __m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
   // CHECK-LABEL: test_mm256_mask_cvtpbh_ps
@@ -196,3 +202,4 @@ __m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_cvtpbh_ps(S, M, A);
 }
+TEST_CONSTEXPR(match_m256(_mm256_mask_cvtpbh_ps((__m256){ 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f }, 0x37, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, -2.0f, 99.0f, -8.0f, 16.0f, 99.0f, 99.0f));

@llvmbot
Copy link
Member

llvmbot commented Nov 27, 2025

@llvm/pr-subscribers-clang

Author: Simon Pilgrim (RKSimon)

Changes

Fixes #154911


Full diff: https://github.com/llvm/llvm-project/pull/169841.diff

4 Files Affected:

  • (modified) clang/lib/Headers/avx512bf16intrin.h (+15-4)
  • (modified) clang/lib/Headers/avx512vlbf16intrin.h (+18-6)
  • (modified) clang/test/CodeGen/X86/avx512bf16-builtins.c (+5)
  • (modified) clang/test/CodeGen/X86/avx512vlbf16-builtins.c (+7)
diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h
index 9a21d8680045a..458d1f8b993ba 100644
--- a/clang/lib/Headers/avx512bf16intrin.h
+++ b/clang/lib/Headers/avx512bf16intrin.h
@@ -25,6 +25,14 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
 #define __DEFAULT_FN_ATTRS                                                     \
   __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
 /// Convert One BF16 Data to One Single Float Data.
 ///
 /// \headerfile <x86intrin.h>
@@ -35,7 +43,7 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
 ///    A bfloat data.
 /// \returns A float data whose sign field and exponent field keep unchanged,
 ///    and fraction field is extended to 23 bits.
-static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
+static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsbh_ss(__bf16 __A) {
   return (float)(__A);
 }
 
@@ -235,7 +243,8 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
 /// \param __A
 ///    A 256-bit vector of [16 x bfloat].
 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_cvtpbh_ps(__m256bh __A) {
   return (__m512) __builtin_convertvector(__A, __v16sf);
 }
 
@@ -249,7 +258,7 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
 /// \param __A
 ///    A 256-bit vector of [16 x bfloat].
 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_cvtpbh_ps(__A),
@@ -268,14 +277,16 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
 /// \param __A
 ///    A 256-bit vector of [16 x bfloat].
 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
   return (__m512)__builtin_ia32_selectps_512(
       (__mmask16)__U, (__v16sf)_mm512_cvtpbh_ps(__A), (__v16sf)__S);
 }
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 #undef __DEFAULT_FN_ATTRS512
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
 
 #endif
 #endif
diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h
index 0e3184554a6c8..8543402065d76 100644
--- a/clang/lib/Headers/avx512vlbf16intrin.h
+++ b/clang/lib/Headers/avx512vlbf16intrin.h
@@ -24,6 +24,14 @@
                  __target__("avx512vl,avx512bf16"),                            \
                  __min_vector_width__(256)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
 /// Convert Two Packed Single Data to One Packed BF16 Data.
 ///
 /// \headerfile <x86intrin.h>
@@ -421,7 +429,8 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
 /// \param __A
 ///    A 128-bit vector of [4 x bfloat].
 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_cvtpbh_ps(__m128bh __A) {
   return (__m128)_mm256_castps256_ps128(
       (__m256) __builtin_convertvector(__A, __v8sf));
 }
@@ -433,7 +442,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_cvtpbh_ps(__m128bh __A) {
   return (__m256) __builtin_convertvector(__A, __v8sf);
 }
 
@@ -447,7 +457,7 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [4 x bfloat].
 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
   return (__m128)__builtin_ia32_selectps_128(
       (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)_mm_setzero_ps());
@@ -463,7 +473,7 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                              (__v8sf)_mm256_cvtpbh_ps(__A),
@@ -483,7 +493,7 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [4 x bfloat].
 /// \returns A 128-bit vector of [4 x float] come from conversion of __A
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
   return (__m128)__builtin_ia32_selectps_128(
       (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S);
@@ -502,7 +512,7 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
 /// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
+static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
   return (__m256)__builtin_ia32_selectps_256(
       (__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S);
@@ -510,6 +520,8 @@ _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 
 #endif
 #endif
diff --git a/clang/test/CodeGen/X86/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c
index dfaae4e459f7e..556c662f13cc0 100644
--- a/clang/test/CodeGen/X86/avx512bf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bf16-builtins.c
@@ -9,6 +9,7 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bf16 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
 
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 float test_mm_cvtsbh_ss(__bf16 A) {
   // CHECK-LABEL: test_mm_cvtsbh_ss
@@ -16,6 +17,7 @@ float test_mm_cvtsbh_ss(__bf16 A) {
   // CHECK: ret float %{{.*}}
   return _mm_cvtsbh_ss(A);
 }
+TEST_CONSTEXPR(_mm_cvtsbh_ss(-1.0f) == -1.0f);
 
 __m512bh test_mm512_cvtne2ps_pbh(__m512 A, __m512 B) {
   // CHECK-LABEL: test_mm512_cvtne2ps_pbh
@@ -82,6 +84,7 @@ __m512 test_mm512_cvtpbh_ps(__m256bh A) {
   // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
   return _mm512_cvtpbh_ps(A);
 }
+TEST_CONSTEXPR(match_m512(_mm512_cvtpbh_ps((__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f));
 
 __m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
   // CHECK-LABEL: test_mm512_maskz_cvtpbh_ps
@@ -89,6 +92,7 @@ __m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_cvtpbh_ps(M, A);
 }
+TEST_CONSTEXPR(match_m512(_mm512_maskz_cvtpbh_ps(0xA753, (__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, 0.0f, 0.0f, -8.0f, 0.0f, -32.0f, 0.0f, -128.0f, -0.5f, 0.25f, 0.0f, 0.0f, 2.0f, 0.0f, 0.0f));
 
 __m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
   // CHECK-LABEL: test_mm512_mask_cvtpbh_ps
@@ -96,3 +100,4 @@ __m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_cvtpbh_ps(S, M, A);
 }
+TEST_CONSTEXPR(match_m512(_mm512_mask_cvtpbh_ps((__m512){ 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f }, 0xA753, (__m256bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f, -128.0f, -0.5f, 0.25f, -0.125f, -4.0f, 2.0f, -1.0f, 0.0f}), -0.0f, 1.0f, 99.0f, 99.0f, -8.0f, 99.0f, -32.0f, 99.0f, -128.0f, -0.5f, 0.25f, 99.0f, 99.0f, 2.0f, 99.0f, 0.0f));
diff --git a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
index 80afcd7a490db..a5adae0cbb935 100644
--- a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
@@ -9,6 +9,7 @@
 // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512bf16 -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s
 
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 __m128bh test_mm_cvtne2ps2bf16(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_cvtne2ps2bf16
@@ -160,12 +161,14 @@ __m128 test_mm_cvtpbh_ps(__m128bh A) {
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm_cvtpbh_ps(A);
 }
+TEST_CONSTEXPR(match_m128(_mm_cvtpbh_ps((__m128bh){-8.0f, 16.0f, -32.0f, 64.0f, -0.0f, 1.0f, -2.0f, 4.0f}), -8.0f, 16.0f, -32.0f, 64.0f));
 
 __m256 test_mm256_cvtpbh_ps(__m128bh A) {
   // CHECK-LABEL: test_mm256_cvtpbh_ps
   // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
   return _mm256_cvtpbh_ps(A);
 }
+TEST_CONSTEXPR(match_m256(_mm256_cvtpbh_ps((__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f));
 
 __m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
   // CHECK-LABEL: test_mm_maskz_cvtpbh_ps
@@ -174,6 +177,7 @@ __m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_maskz_cvtpbh_ps(M, A);
 }
+TEST_CONSTEXPR(match_m128(_mm_maskz_cvtpbh_ps(0x01, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 0.0f, 0.0f, 0.0f));
 
 __m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
   // CHECK-LABEL: test_mm256_maskz_cvtpbh_ps
@@ -181,6 +185,7 @@ __m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_cvtpbh_ps(M, A);
 }
+TEST_CONSTEXPR(match_m256(_mm256_maskz_cvtpbh_ps(0x73, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, 0.0f, 0.0f, -8.0f, 16.0f, -32.0f, 0.0f));
 
 __m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
   // CHECK-LABEL: test_mm_mask_cvtpbh_ps
@@ -189,6 +194,7 @@ __m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_cvtpbh_ps(S, M, A);
 }
+TEST_CONSTEXPR(match_m128(_mm_mask_cvtpbh_ps((__m128){ 99.0f, 99.0f, 99.0f, 99.0f }, 0x03, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, 99.0f, 99.0f));
 
 __m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
   // CHECK-LABEL: test_mm256_mask_cvtpbh_ps
@@ -196,3 +202,4 @@ __m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_cvtpbh_ps(S, M, A);
 }
+TEST_CONSTEXPR(match_m256(_mm256_mask_cvtpbh_ps((__m256){ 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f }, 0x37, (__m128bh){-0.0f, 1.0f, -2.0f, 4.0f, -8.0f, 16.0f, -32.0f, 64.0f}), -0.0f, 1.0f, -2.0f, 99.0f, -8.0f, 16.0f, 99.0f, 99.0f));

/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why leave these 3 not constexpr?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm just looking at bf16 to f32 in this patch as this can be safely handled entirely generically

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I see the title now :)

/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use __builtin_convertvector for this too?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docs indicate these always round to nearest even - we could just handle the builtins in constexpr - similar to how we handle PS2PH?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm hesitant to use __builtin_convertvector as although it looks like we're consistent with fp32 -> bf16 using round rmNearestTiesToEven in dag, I don't know the code well enough to be sure that we do this everywhere - do you think its ok?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When AVX512BF16 is enabled, it's lowered to vcvtneps2bf16, https://godbolt.org/z/d5snYdGn4. Otherwise, it's scalarized with libcall. Since we change for intrinsic, AVX512BF16 is guaranteed. I don't see any risk here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's simple than PS2PH since round to nearest even is the only rounding mode.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cheers - I'll create a ticket

Copy link
Contributor

@phoebewang phoebewang left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@RKSimon RKSimon enabled auto-merge (squash) November 28, 2025 12:41
@RKSimon RKSimon merged commit 346f48e into llvm:main Nov 28, 2025
9 of 10 checks passed
@RKSimon RKSimon deleted the x86-bf16-constexpr branch November 28, 2025 13:10
aahrun pushed a commit to aahrun/llvm-project that referenced this pull request Dec 1, 2025
augusto2112 pushed a commit to augusto2112/llvm-project that referenced this pull request Dec 3, 2025
kcloudy0717 pushed a commit to kcloudy0717/llvm-project that referenced this pull request Dec 4, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics clang Clang issues not falling into any other category

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[Headers][X86] Convert bf16 to f32 conversions to generic constexpr implementations

3 participants