Skip to content

Commit 4407090

Browse files
committed
[Headers][X86] Update FMA3/FMA4 scalar intrinsics to use __builtin_elementwise_fma and support constexpr
Now that #152455 is done, we can make all the scalar FMA intrinsics wrap __builtin_elementwise_fma, which also allows them to be used in constexpr. The main difference is that the FMA4 intrinsics guarantee that the upper elements are zero, while the FMA3 intrinsics pass through the destination register elements, like older scalar instructions.
1 parent 20034ba commit 4407090

File tree

6 files changed

+164
-140
lines changed

6 files changed

+164
-140
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -866,16 +866,6 @@ let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
866866
def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
867867
}
868868

869-
let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
870-
def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
871-
def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
872-
}
873-
874-
let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
875-
def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
876-
def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
877-
}
878-
879869
let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
880870
def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
881871
def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;

clang/lib/CodeGen/TargetBuiltins/X86.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,16 +1028,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
10281028
case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
10291029
return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
10301030

1031-
case X86::BI__builtin_ia32_vfmaddss3:
1032-
case X86::BI__builtin_ia32_vfmaddsd3:
10331031
case X86::BI__builtin_ia32_vfmaddsh3_mask:
10341032
case X86::BI__builtin_ia32_vfmaddss3_mask:
10351033
case X86::BI__builtin_ia32_vfmaddsd3_mask:
10361034
return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
1037-
case X86::BI__builtin_ia32_vfmaddss:
1038-
case X86::BI__builtin_ia32_vfmaddsd:
1039-
return EmitScalarFMAExpr(*this, E, Ops,
1040-
Constant::getNullValue(Ops[0]->getType()));
10411035
case X86::BI__builtin_ia32_vfmaddsh3_maskz:
10421036
case X86::BI__builtin_ia32_vfmaddss3_maskz:
10431037
case X86::BI__builtin_ia32_vfmaddsd3_maskz:

clang/lib/Headers/fma4intrin.h

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,16 @@ _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
4040
(__v2df)__C);
4141
}
4242

43-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
43+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
4444
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
4545
{
46-
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
46+
return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
4747
}
4848

49-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
49+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
5050
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
5151
{
52-
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
52+
return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
5353
}
5454

5555
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -64,16 +64,16 @@ _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
6464
-(__v2df)__C);
6565
}
6666

67-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
67+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
6868
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
6969
{
70-
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
70+
return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
7171
}
7272

73-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
73+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
7474
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
7575
{
76-
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
76+
return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
7777
}
7878

7979
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -88,16 +88,16 @@ _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
8888
(__v2df)__C);
8989
}
9090

91-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
91+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
9292
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
9393
{
94-
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
94+
return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
9595
}
9696

97-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
97+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
9898
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
9999
{
100-
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
100+
return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
101101
}
102102

103103
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -112,16 +112,16 @@ _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
112112
-(__v2df)__C);
113113
}
114114

115-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
115+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
116116
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
117117
{
118-
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
118+
return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
119119
}
120120

121-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
121+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
122122
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
123123
{
124-
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
124+
return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
125125
}
126126

127127
static __inline__ __m128 __DEFAULT_FN_ATTRS128

clang/lib/Headers/fmaintrin.h

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,11 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
9595
/// 32 bits.
9696
/// \returns A 128-bit vector of [4 x float] containing the result in the low
9797
/// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
98-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
98+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
9999
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
100100
{
101-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
101+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
102+
return __A;
102103
}
103104

104105
/// Computes a scalar multiply-add of the double-precision values in the
@@ -124,10 +125,11 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
124125
/// 64 bits.
125126
/// \returns A 128-bit vector of [2 x double] containing the result in the low
126127
/// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
127-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
128+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
128129
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
129130
{
130-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
131+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
132+
return __A;
131133
}
132134

133135
/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
@@ -195,10 +197,11 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
195197
/// 32 bits.
196198
/// \returns A 128-bit vector of [4 x float] containing the result in the low
197199
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
198-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
200+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
199201
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
200202
{
201-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
203+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
204+
return __A;
202205
}
203206

204207
/// Computes a scalar multiply-subtract of the double-precision values in
@@ -224,10 +227,11 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
224227
/// 64 bits.
225228
/// \returns A 128-bit vector of [2 x double] containing the result in the low
226229
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
227-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
230+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
228231
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
229232
{
230-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
233+
__A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
234+
return __A;
231235
}
232236

233237
/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
@@ -295,10 +299,11 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
295299
/// 32 bits.
296300
/// \returns A 128-bit vector of [4 x float] containing the result in the low
297301
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
298-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
302+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
299303
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
300304
{
301-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
305+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
306+
return __A;
302307
}
303308

304309
/// Computes a scalar negated multiply-add of the double-precision values
@@ -324,10 +329,11 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
324329
/// 64 bits.
325330
/// \returns A 128-bit vector of [2 x double] containing the result in the low
326331
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
327-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
332+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
328333
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
329334
{
330-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
335+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
336+
return __A;
331337
}
332338

333339
/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
@@ -395,10 +401,11 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
395401
/// 32 bits.
396402
/// \returns A 128-bit vector of [4 x float] containing the result in the low
397403
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
398-
static __inline__ __m128 __DEFAULT_FN_ATTRS128
404+
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
399405
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
400406
{
401-
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
407+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
408+
return __A;
402409
}
403410

404411
/// Computes a scalar negated multiply-subtract of the double-precision
@@ -424,10 +431,11 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
424431
/// 64 bits.
425432
/// \returns A 128-bit vector of [2 x double] containing the result in the low
426433
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
427-
static __inline__ __m128d __DEFAULT_FN_ATTRS128
434+
static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
428435
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
429436
{
430-
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
437+
__A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
438+
return __A;
431439
}
432440

433441
/// Computes a multiply with alternating add/subtract of 128-bit vectors of

clang/test/CodeGen/X86/fma-builtins.c

Lines changed: 54 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -28,23 +28,25 @@ TEST_CONSTEXPR(match_m128d(_mm_fmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
2828

2929
__m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) {
3030
// CHECK-LABEL: test_mm_fmadd_ss
31-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
32-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
33-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
31+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
32+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
33+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
3434
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
35-
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
35+
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
3636
return _mm_fmadd_ss(a, b, c);
3737
}
38+
TEST_CONSTEXPR(match_m128(_mm_fmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -7.0f, 1.0f, -2.0f, -0.0f));
3839

3940
__m128d test_mm_fmadd_sd(__m128d a, __m128d b, __m128d c) {
4041
// CHECK-LABEL: test_mm_fmadd_sd
41-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
42-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
43-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
42+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
43+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
44+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
4445
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
45-
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
46+
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
4647
return _mm_fmadd_sd(a, b, c);
4748
}
49+
TEST_CONSTEXPR(match_m128d(_mm_fmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -12.0, 1.0));
4850

4951
__m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
5052
// CHECK-LABEL: test_mm_fmsub_ps
@@ -64,25 +66,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
6466

6567
__m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) {
6668
// CHECK-LABEL: test_mm_fmsub_ss
67-
// CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
68-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
69-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
70-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
71-
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
72-
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
69+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
70+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
71+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
72+
// CHECK: [[NEG:%.+]] = fneg float %{{.+}}
73+
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[NEG]])
74+
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
7375
return _mm_fmsub_ss(a, b, c);
7476
}
77+
TEST_CONSTEXPR(match_m128(_mm_fmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -9.0f, 1.0f, -2.0f, -0.0f));
7578

7679
__m128d test_mm_fmsub_sd(__m128d a, __m128d b, __m128d c) {
7780
// CHECK-LABEL: test_mm_fmsub_sd
78-
// CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
79-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
80-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
81-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
82-
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
83-
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
81+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
82+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
83+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
84+
// CHECK: [[NEG:%.+]] = fneg double %{{.+}}
85+
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[NEG]])
86+
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
8487
return _mm_fmsub_sd(a, b, c);
8588
}
89+
TEST_CONSTEXPR(match_m128d(_mm_fmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 4.0, 1.0));
8690

8791
__m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
8892
// CHECK-LABEL: test_mm_fnmadd_ps
@@ -102,25 +106,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0
102106

103107
__m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) {
104108
// CHECK-LABEL: test_mm_fnmadd_ss
105-
// CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
106-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
107-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
108-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
109-
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
110-
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
109+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
110+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
111+
// CHECK: [[NEG:%.+]] = fneg float %{{.+}}
112+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
113+
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float %{{.*}})
114+
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
111115
return _mm_fnmadd_ss(a, b, c);
112116
}
117+
TEST_CONSTEXPR(match_m128(_mm_fnmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 9.0f, 1.0f, -2.0f, -0.0f));
113118

114119
__m128d test_mm_fnmadd_sd(__m128d a, __m128d b, __m128d c) {
115120
// CHECK-LABEL: test_mm_fnmadd_sd
116-
// CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
117-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
118-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
119-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
120-
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
121-
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
121+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
122+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
123+
// CHECK: [[NEG:%.+]] = fneg double %{{.+}}
124+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
125+
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double %{{.*}})
126+
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
122127
return _mm_fnmadd_sd(a, b, c);
123128
}
129+
TEST_CONSTEXPR(match_m128d(_mm_fnmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -4.0, 1.0));
124130

125131
__m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
126132
// CHECK-LABEL: test_mm_fnmsub_ps
@@ -142,27 +148,29 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0
142148

143149
__m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) {
144150
// CHECK-LABEL: test_mm_fnmsub_ss
145-
// CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
146-
// CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.+}}
147-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
148-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
149-
// CHECK: extractelement <4 x float> %{{.*}}, i64 0
150-
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
151-
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
151+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
152+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
153+
// CHECK: [[NEG:%.+]] = fneg float %{{.+}}
154+
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
155+
// CHECK: [[NEG2:%.+]] = fneg float %{{.+}}
156+
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float [[NEG2]])
157+
// CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
152158
return _mm_fnmsub_ss(a, b, c);
153159
}
160+
TEST_CONSTEXPR(match_m128(_mm_fnmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 7.0f, 1.0f, -2.0f, -0.0f));
154161

155162
__m128d test_mm_fnmsub_sd(__m128d a, __m128d b, __m128d c) {
156163
// CHECK-LABEL: test_mm_fnmsub_sd
157-
// CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
158-
// CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.+}}
159-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
160-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
161-
// CHECK: extractelement <2 x double> %{{.*}}, i64 0
162-
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
163-
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
164+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
165+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
166+
// CHECK: [[NEG:%.+]] = fneg double %{{.+}}
167+
// CHECK: extractelement <2 x double> %{{.*}}, i32 0
168+
// CHECK: [[NEG2:%.+]] = fneg double %{{.+}}
169+
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double [[NEG2]])
170+
// CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
164171
return _mm_fnmsub_sd(a, b, c);
165172
}
173+
TEST_CONSTEXPR(match_m128d(_mm_fnmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 12.0, 1.0));
166174

167175
__m128 test_mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c) {
168176
// CHECK-LABEL: test_mm_fmaddsub_ps

0 commit comments

Comments
 (0)