[Headers][X86] Update FMA3/FMA4 scalar intrinsics to use __builtin_elementwise_fma and support constexpr (#154731)

RKSimon · web-flow · commit 59c01cc8bb37 · 2025-11-13T11:36:22.000Z
Now that #152455 is done, we can make all the scalar fma intrinsics to wrap __builtin_elementwise_fma, which also allows constexpr The main difference is that FMA4 intrinsics guarantee that the upper elements are zero, while FMA3 passes through the destination register elements like older scalar instructions Fixes #154555
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
@@ -866,16 +866,6 @@ let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
   def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
 }
 
-let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
-  def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
-}
-
-let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
-  def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
-}
-
 let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
   def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -1028,16 +1028,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
     return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
 
-  case X86::BI__builtin_ia32_vfmaddss3:
-  case X86::BI__builtin_ia32_vfmaddsd3:
   case X86::BI__builtin_ia32_vfmaddsh3_mask:
   case X86::BI__builtin_ia32_vfmaddss3_mask:
   case X86::BI__builtin_ia32_vfmaddsd3_mask:
     return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
-  case X86::BI__builtin_ia32_vfmaddss:
-  case X86::BI__builtin_ia32_vfmaddsd:
-    return EmitScalarFMAExpr(*this, E, Ops,
-                             Constant::getNullValue(Ops[0]->getType()));
   case X86::BI__builtin_ia32_vfmaddsh3_maskz:
   case X86::BI__builtin_ia32_vfmaddss3_maskz:
   case X86::BI__builtin_ia32_vfmaddsd3_maskz:
diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h
@@ -40,16 +40,14 @@ _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) {
+  return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) {
+  return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -64,16 +62,14 @@ _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -88,16 +84,14 @@ _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             (__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) {
+  return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) {
+  return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -112,16 +106,14 @@ _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
                                             -(__v2df)__C);
 }
 
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
 }
 
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
@@ -95,10 +95,10 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 ///    32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a scalar multiply-add of the double-precision values in the
@@ -124,10 +124,10 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
@@ -195,10 +195,10 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 ///   32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a scalar multiply-subtract of the double-precision values in
@@ -224,10 +224,10 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
@@ -295,10 +295,10 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 ///    32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a scalar negated multiply-add of the double-precision values
@@ -324,10 +324,10 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
@@ -395,10 +395,10 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 ///    32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a scalar negated multiply-subtract of the double-precision
@@ -424,10 +424,10 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 ///    64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
diff --git a/clang/test/CodeGen/X86/fma-builtins.c b/clang/test/CodeGen/X86/fma-builtins.c
diff --git a/clang/test/CodeGen/X86/fma4-builtins.c b/clang/test/CodeGen/X86/fma4-builtins.c