Skip to content

Commit 3c9d5ad

Browse files
committed
[Headers][X86] Update FMA3/FMA4 scalar intrinsics to use __builtin_elementwise_fma and support constexpr
Now that #152455 is done, we can make all the scalar FMA intrinsics wrap __builtin_elementwise_fma, which also allows them to be used in constexpr evaluation. The main difference is that the FMA4 intrinsics guarantee that the upper elements are zero, while the FMA3 intrinsics pass the upper elements of the destination register through, like older scalar instructions.
1 parent bd63d93 commit 3c9d5ad

File tree

6 files changed

+164
-140
lines changed

6 files changed

+164
-140
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -885,16 +885,6 @@ let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
885885
def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
886886
}
887887

888-
let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
889-
def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
890-
def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
891-
}
892-
893-
let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
894-
def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
895-
def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
896-
}
897-
898888
let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
899889
def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
900890
def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;

clang/lib/CodeGen/TargetBuiltins/X86.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,16 +1028,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
10281028
case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
10291029
return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
10301030

1031-
case X86::BI__builtin_ia32_vfmaddss3:
1032-
case X86::BI__builtin_ia32_vfmaddsd3:
10331031
case X86::BI__builtin_ia32_vfmaddsh3_mask:
10341032
case X86::BI__builtin_ia32_vfmaddss3_mask:
10351033
case X86::BI__builtin_ia32_vfmaddsd3_mask:
10361034
return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
1037-
case X86::BI__builtin_ia32_vfmaddss:
1038-
case X86::BI__builtin_ia32_vfmaddsd:
1039-
return EmitScalarFMAExpr(*this, E, Ops,
1040-
Constant::getNullValue(Ops[0]->getType()));
10411035
case X86::BI__builtin_ia32_vfmaddsh3_maskz:
10421036
case X86::BI__builtin_ia32_vfmaddss3_maskz:
10431037
case X86::BI__builtin_ia32_vfmaddsd3_maskz:

clang/lib/Headers/fma4intrin.h

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,16 @@ _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
4040
(__v2df)__C);
4141
}
4242

43-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
43+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
4444
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
4545
{
46-
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
46+
return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
4747
}
4848

49-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
49+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
5050
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
5151
{
52-
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
52+
return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
5353
}
5454

5555
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -64,16 +64,16 @@ _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
6464
-(__v2df)__C);
6565
}
6666

67-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
67+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
6868
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
6969
{
70-
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
70+
return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
7171
}
7272

73-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
73+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
7474
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
7575
{
76-
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
76+
return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
7777
}
7878

7979
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -88,16 +88,16 @@ _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
8888
(__v2df)__C);
8989
}
9090

91-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
91+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
9292
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
9393
{
94-
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
94+
return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
9595
}
9696

97-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
97+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
9898
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
9999
{
100-
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
100+
return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
101101
}
102102

103103
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -112,16 +112,16 @@ _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
112112
-(__v2df)__C);
113113
}
114114

115-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
115+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
116116
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
117117
{
118-
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
118+
return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
119119
}
120120

121-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
121+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
122122
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
123123
{
124-
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
124+
return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
125125
}
126126

127127
static __inline__ __m128 __DEFAULT_FN_ATTRS128

clang/lib/Headers/fmaintrin.h

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,11 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
9191
/// 32 bits.
9292
/// \returns A 128-bit vector of [4 x float] containing the result in the low
9393
/// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
94-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
94+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
9595
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
9696
{
97-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
97+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
98+
return __A;
9899
}
99100

100101
/// Computes a scalar multiply-add of the double-precision values in the
@@ -120,10 +121,11 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
120121
/// 64 bits.
121122
/// \returns A 128-bit vector of [2 x double] containing the result in the low
122123
/// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
123-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
124+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
124125
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
125126
{
126-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
127+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
128+
return __A;
127129
}
128130

129131
/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
@@ -191,10 +193,11 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
191193
/// 32 bits.
192194
/// \returns A 128-bit vector of [4 x float] containing the result in the low
193195
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
194-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
196+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
195197
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
196198
{
197-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
199+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
200+
return __A;
198201
}
199202

200203
/// Computes a scalar multiply-subtract of the double-precision values in
@@ -220,10 +223,11 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
220223
/// 64 bits.
221224
/// \returns A 128-bit vector of [2 x double] containing the result in the low
222225
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
223-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
226+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
224227
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
225228
{
226-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
229+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
230+
return __A;
227231
}
228232

229233
/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
@@ -291,10 +295,11 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
291295
/// 32 bits.
292296
/// \returns A 128-bit vector of [4 x float] containing the result in the low
293297
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
294-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
298+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
295299
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
296300
{
297-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
301+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
302+
return __A;
298303
}
299304

300305
/// Computes a scalar negated multiply-add of the double-precision values
@@ -320,10 +325,11 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
320325
/// 64 bits.
321326
/// \returns A 128-bit vector of [2 x double] containing the result in the low
322327
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
323-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
328+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
324329
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
325330
{
326-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
331+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
332+
return __A;
327333
}
328334

329335
/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
@@ -391,10 +397,11 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
391397
/// 32 bits.
392398
/// \returns A 128-bit vector of [4 x float] containing the result in the low
393399
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
394-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
400+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
395401
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
396402
{
397-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
403+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
404+
return __A;
398405
}
399406

400407
/// Computes a scalar negated multiply-subtract of the double-precision
@@ -420,10 +427,11 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
420427
/// 64 bits.
421428
/// \returns A 128-bit vector of [2 x double] containing the result in the low
422429
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
423-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
430+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
424431
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
425432
{
426-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
433+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
434+
return __A;
427435
}
428436

429437
/// Computes a multiply with alternating add/subtract of 128-bit vectors of

clang/test/CodeGen/X86/fma-builtins.c

Lines changed: 54 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -23,23 +23,25 @@ TEST_CONSTEXPR(match_m128d(_mm_fmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
2323

2424
__m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) {
2525
// CHECK-LABEL: test_mm_fmadd_ss
26-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
27-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
28-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
26+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
27+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
28+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
2929
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
30-
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
30+
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
3131
return _mm_fmadd_ss(a, b, c);
3232
}
33+
TEST_CONSTEXPR(match_m128(_mm_fmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -7.0f, 1.0f, -2.0f, -0.0f));
3334

3435
__m128d test_mm_fmadd_sd(__m128d a, __m128d b, __m128d c) {
3536
// CHECK-LABEL: test_mm_fmadd_sd
36-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
37-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
38-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
37+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
38+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
39+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
3940
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
40-
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
41+
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
4142
return _mm_fmadd_sd(a, b, c);
4243
}
44+
TEST_CONSTEXPR(match_m128d(_mm_fmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -12.0, 1.0));
4345

4446
__m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
4547
// CHECK-LABEL: test_mm_fmsub_ps
@@ -59,25 +61,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
5961

6062
__m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) {
6163
// CHECK-LABEL: test_mm_fmsub_ss
62-
// CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
63-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
64-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
65-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
66-
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
67-
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
64+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
65+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
66+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
67+
// CHECK: [[NEG:%.+]] = fneg float %{{.+}}
68+
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[NEG]])
69+
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
6870
return _mm_fmsub_ss(a, b, c);
6971
}
72+
TEST_CONSTEXPR(match_m128(_mm_fmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -9.0f, 1.0f, -2.0f, -0.0f));
7073

7174
__m128d test_mm_fmsub_sd(__m128d a, __m128d b, __m128d c) {
7275
// CHECK-LABEL: test_mm_fmsub_sd
73-
// CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
74-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
75-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
76-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
77-
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
78-
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
76+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
77+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
78+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
79+
// CHECK: [[NEG:%.+]] = fneg double %{{.+}}
80+
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[NEG]])
81+
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
7982
return _mm_fmsub_sd(a, b, c);
8083
}
84+
TEST_CONSTEXPR(match_m128d(_mm_fmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 4.0, 1.0));
8185

8286
__m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
8387
// CHECK-LABEL: test_mm_fnmadd_ps
@@ -97,25 +101,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0
97101

98102
__m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) {
99103
// CHECK-LABEL: test_mm_fnmadd_ss
100-
// CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
101-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
102-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
103-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
104-
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
105-
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
104+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
105+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
106+
// CHECK: [[NEG:%.+]] = fneg float %{{.+}}
107+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
108+
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float %{{.*}})
109+
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
106110
return _mm_fnmadd_ss(a, b, c);
107111
}
112+
TEST_CONSTEXPR(match_m128(_mm_fnmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 9.0f, 1.0f, -2.0f, -0.0f));
108113

109114
__m128d test_mm_fnmadd_sd(__m128d a, __m128d b, __m128d c) {
110115
// CHECK-LABEL: test_mm_fnmadd_sd
111-
// CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
112-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
113-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
114-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
115-
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
116-
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
116+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
117+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
118+
// CHECK: [[NEG:%.+]] = fneg double %{{.+}}
119+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
120+
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double %{{.*}})
121+
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
117122
return _mm_fnmadd_sd(a, b, c);
118123
}
124+
TEST_CONSTEXPR(match_m128d(_mm_fnmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -4.0, 1.0));
119125

120126
__m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
121127
// CHECK-LABEL: test_mm_fnmsub_ps
@@ -137,27 +143,29 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0
137143

138144
__m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) {
139145
// CHECK-LABEL: test_mm_fnmsub_ss
140-
// CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
141-
// CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.+}}
142-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
143-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
144-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
145-
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
146-
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
146+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
147+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
148+
// CHECK: [[NEG:%.+]] = fneg float %{{.+}}
149+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
150+
// CHECK: [[NEG2:%.+]] = fneg float %{{.+}}
151+
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float [[NEG2]])
152+
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
147153
return _mm_fnmsub_ss(a, b, c);
148154
}
155+
TEST_CONSTEXPR(match_m128(_mm_fnmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 7.0f, 1.0f, -2.0f, -0.0f));
149156

150157
__m128d test_mm_fnmsub_sd(__m128d a, __m128d b, __m128d c) {
151158
// CHECK-LABEL: test_mm_fnmsub_sd
152-
// CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
153-
// CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.+}}
154-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
155-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
156-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
157-
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
158-
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
159+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
160+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
161+
// CHECK: [[NEG:%.+]] = fneg double %{{.+}}
162+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
163+
// CHECK: [[NEG2:%.+]] = fneg double %{{.+}}
164+
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double [[NEG2]])
165+
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
159166
return _mm_fnmsub_sd(a, b, c);
160167
}
168+
TEST_CONSTEXPR(match_m128d(_mm_fnmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 12.0, 1.0));
161169

162170
__m128 test_mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c) {
163171
// CHECK-LABEL: test_mm_fmaddsub_ps

0 commit comments

Comments
 (0)