Skip to content

Commit 59c01cc

Browse files
authored
[Headers][X86] Update FMA3/FMA4 scalar intrinsics to use __builtin_elementwise_fma and support constexpr (#154731)
Now that #152455 is done, we can make all the scalar FMA intrinsics wrap __builtin_elementwise_fma, which also allows them to be used in constexpr contexts. The main difference is that the FMA4 intrinsics guarantee that the upper elements are zero, while the FMA3 intrinsics pass through the upper elements of the destination register, like older scalar instructions. Fixes #154555.
1 parent 20034ba commit 59c01cc

File tree

6 files changed

+180
-172
lines changed

6 files changed

+180
-172
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -866,16 +866,6 @@ let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
866866
def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
867867
}
868868

869-
let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
870-
def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
871-
def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
872-
}
873-
874-
let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
875-
def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
876-
def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
877-
}
878-
879869
let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
880870
def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
881871
def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;

clang/lib/CodeGen/TargetBuiltins/X86.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,16 +1028,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
10281028
case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
10291029
return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
10301030

1031-
case X86::BI__builtin_ia32_vfmaddss3:
1032-
case X86::BI__builtin_ia32_vfmaddsd3:
10331031
case X86::BI__builtin_ia32_vfmaddsh3_mask:
10341032
case X86::BI__builtin_ia32_vfmaddss3_mask:
10351033
case X86::BI__builtin_ia32_vfmaddsd3_mask:
10361034
return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
1037-
case X86::BI__builtin_ia32_vfmaddss:
1038-
case X86::BI__builtin_ia32_vfmaddsd:
1039-
return EmitScalarFMAExpr(*this, E, Ops,
1040-
Constant::getNullValue(Ops[0]->getType()));
10411035
case X86::BI__builtin_ia32_vfmaddsh3_maskz:
10421036
case X86::BI__builtin_ia32_vfmaddss3_maskz:
10431037
case X86::BI__builtin_ia32_vfmaddsd3_maskz:

clang/lib/Headers/fma4intrin.h

Lines changed: 24 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,14 @@ _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
4040
(__v2df)__C);
4141
}
4242

43-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
44-
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
45-
{
46-
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
43+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
44+
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) {
45+
return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
4746
}
4847

49-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
50-
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
51-
{
52-
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
48+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
49+
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) {
50+
return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
5351
}
5452

5553
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -64,16 +62,14 @@ _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
6462
-(__v2df)__C);
6563
}
6664

67-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
68-
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
69-
{
70-
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
65+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
66+
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) {
67+
return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
7168
}
7269

73-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
74-
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
75-
{
76-
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
70+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
71+
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) {
72+
return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
7773
}
7874

7975
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -88,16 +84,14 @@ _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
8884
(__v2df)__C);
8985
}
9086

91-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
92-
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
93-
{
94-
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
87+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
88+
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) {
89+
return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
9590
}
9691

97-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
98-
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
99-
{
100-
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
92+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
93+
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) {
94+
return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
10195
}
10296

10397
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -112,16 +106,14 @@ _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
112106
-(__v2df)__C);
113107
}
114108

115-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
116-
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
117-
{
118-
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
109+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
110+
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
111+
return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
119112
}
120113

121-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
122-
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
123-
{
124-
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
114+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
115+
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
116+
return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
125117
}
126118

127119
static __inline__ __m128 __DEFAULT_FN_ATTRS128

clang/lib/Headers/fmaintrin.h

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,10 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
9595
/// 32 bits.
9696
/// \returns A 128-bit vector of [4 x float] containing the result in the low
9797
/// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
98-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
99-
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
100-
{
101-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
98+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
99+
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
100+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
101+
return __A;
102102
}
103103

104104
/// Computes a scalar multiply-add of the double-precision values in the
@@ -124,10 +124,10 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
124124
/// 64 bits.
125125
/// \returns A 128-bit vector of [2 x double] containing the result in the low
126126
/// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
127-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
128-
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
129-
{
130-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
127+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
128+
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
129+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
130+
return __A;
131131
}
132132

133133
/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
@@ -195,10 +195,10 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
195195
/// 32 bits.
196196
/// \returns A 128-bit vector of [4 x float] containing the result in the low
197197
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
198-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
199-
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
200-
{
201-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
198+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
199+
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
200+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
201+
return __A;
202202
}
203203

204204
/// Computes a scalar multiply-subtract of the double-precision values in
@@ -224,10 +224,10 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
224224
/// 64 bits.
225225
/// \returns A 128-bit vector of [2 x double] containing the result in the low
226226
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
227-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
228-
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
229-
{
230-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
227+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
228+
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
229+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
230+
return __A;
231231
}
232232

233233
/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
@@ -295,10 +295,10 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
295295
/// 32 bits.
296296
/// \returns A 128-bit vector of [4 x float] containing the result in the low
297297
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
298-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
299-
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
300-
{
301-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
298+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
299+
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
300+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
301+
return __A;
302302
}
303303

304304
/// Computes a scalar negated multiply-add of the double-precision values
@@ -324,10 +324,10 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
324324
/// 64 bits.
325325
/// \returns A 128-bit vector of [2 x double] containing the result in the low
326326
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
327-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
328-
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
329-
{
330-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
327+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
328+
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
329+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
330+
return __A;
331331
}
332332

333333
/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
@@ -395,10 +395,10 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
395395
/// 32 bits.
396396
/// \returns A 128-bit vector of [4 x float] containing the result in the low
397397
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
398-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
399-
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
400-
{
401-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
398+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
399+
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
400+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
401+
return __A;
402402
}
403403

404404
/// Computes a scalar negated multiply-subtract of the double-precision
@@ -424,10 +424,10 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
424424
/// 64 bits.
425425
/// \returns A 128-bit vector of [2 x double] containing the result in the low
426426
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
427-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
428-
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
429-
{
430-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
427+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
428+
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
429+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
430+
return __A;
431431
}
432432

433433
/// Computes a multiply with alternating add/subtract of 128-bit vectors of

0 commit comments

Comments
 (0)