[Headers][X86] Update FMA3/FMA4 scalar intrinsics to use __builtin_elementwise_fma and support constexpr #154731
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from e15a719 to 3c9d5ad
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-clang

Author: Simon Pilgrim (RKSimon)

Changes

Now that #152455 is done, we can make all of the scalar FMA intrinsics wrap __builtin_elementwise_fma, which also allows them to be used in constexpr evaluation.

The main difference is that the FMA4 intrinsics guarantee that the upper elements are zero, while FMA3 passes through the destination register elements, like older scalar instructions.

Fixes #154555

Patch is 30.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154731.diff

6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 527acd9ef086e..8b0d850c27504 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -857,16 +857,6 @@ let Features = "sha", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def sha256msg2 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
-let Features = "fma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vfmaddss3 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
- def vfmaddsd3 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
-}
-
-let Features = "fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vfmaddss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
- def vfmaddsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
-}
-
let Features = "fma|fma4", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vfmaddsubps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
def vfmaddsubpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index b9248a7d43f85..d8609c3e00f4f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -1028,16 +1028,10 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
- case X86::BI__builtin_ia32_vfmaddss3:
- case X86::BI__builtin_ia32_vfmaddsd3:
case X86::BI__builtin_ia32_vfmaddsh3_mask:
case X86::BI__builtin_ia32_vfmaddss3_mask:
case X86::BI__builtin_ia32_vfmaddsd3_mask:
return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
- case X86::BI__builtin_ia32_vfmaddss:
- case X86::BI__builtin_ia32_vfmaddsd:
- return EmitScalarFMAExpr(*this, E, Ops,
- Constant::getNullValue(Ops[0]->getType()));
case X86::BI__builtin_ia32_vfmaddsh3_maskz:
case X86::BI__builtin_ia32_vfmaddss3_maskz:
case X86::BI__builtin_ia32_vfmaddsd3_maskz:
diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h
index e0a0e4c968950..426be2d3c3658 100644
--- a/clang/lib/Headers/fma4intrin.h
+++ b/clang/lib/Headers/fma4intrin.h
@@ -40,16 +40,16 @@ _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
(__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], __C[0]));
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -64,16 +64,16 @@ _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
-(__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+ return _mm_set_ss(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+ return _mm_set_sd(__builtin_elementwise_fma(__A[0], __B[0], -__C[0]));
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -88,16 +88,16 @@ _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
(__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], __C[0]));
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
@@ -112,16 +112,16 @@ _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
-(__v2df)__C);
}
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+ return _mm_set_ss(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
}
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+ return _mm_set_sd(__builtin_elementwise_fma(-__A[0], __B[0], -__C[0]));
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index d8ea489022b8f..2aae620e04fb9 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -91,10 +91,11 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+ return __A;
}
/// Computes a scalar multiply-add of the double-precision values in the
@@ -120,10 +121,11 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+ return __A;
}
/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
@@ -191,10 +193,11 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+ __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+ return __A;
}
/// Computes a scalar multiply-subtract of the double-precision values in
@@ -220,10 +223,11 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+ __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+ return __A;
}
/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
@@ -291,10 +295,11 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
+ __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+ return __A;
}
/// Computes a scalar negated multiply-add of the double-precision values
@@ -320,10 +325,11 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
+ __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+ return __A;
}
/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
@@ -391,10 +397,11 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
+ __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+ return __A;
}
/// Computes a scalar negated multiply-subtract of the double-precision
@@ -420,10 +427,11 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
+ __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+ return __A;
}
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
diff --git a/clang/test/CodeGen/X86/fma-builtins.c b/clang/test/CodeGen/X86/fma-builtins.c
index 8e9822ec6ad2f..5601980166697 100644
--- a/clang/test/CodeGen/X86/fma-builtins.c
+++ b/clang/test/CodeGen/X86/fma-builtins.c
@@ -23,23 +23,25 @@ TEST_CONSTEXPR(match_m128d(_mm_fmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
__m128 test_mm_fmadd_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmadd_ss
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
// CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
- // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
return _mm_fmadd_ss(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -7.0f, 1.0f, -2.0f, -0.0f));
__m128d test_mm_fmadd_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmadd_sd
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
// CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
- // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
return _mm_fmadd_sd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -12.0, 1.0));
__m128 test_mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmsub_ps
@@ -59,25 +61,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0,
__m128 test_mm_fmsub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmsub_ss
- // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
- // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+ // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float [[NEG]])
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
return _mm_fmsub_ss(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), -9.0f, 1.0f, -2.0f, -0.0f));
__m128d test_mm_fmsub_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fmsub_sd
- // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
- // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+ // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double [[NEG]])
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
return _mm_fmsub_sd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 4.0, 1.0));
__m128 test_mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmadd_ps
@@ -97,25 +101,27 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmadd_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0
__m128 test_mm_fnmadd_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmadd_ss
- // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
- // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float %{{.*}})
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
return _mm_fnmadd_ss(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fnmadd_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 9.0f, 1.0f, -2.0f, -0.0f));
__m128d test_mm_fnmadd_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmadd_sd
- // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
- // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double %{{.*}})
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
return _mm_fnmadd_sd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fnmadd_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), -4.0, 1.0));
__m128 test_mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmsub_ps
@@ -137,27 +143,29 @@ TEST_CONSTEXPR(match_m128d(_mm_fnmsub_pd((__m128d){ 0.0, -4.0 }, (__m128d){ -0.0
__m128 test_mm_fnmsub_ss(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fnmsub_ss
- // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.+}}
- // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.+}}
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: extractelement <4 x float> %{{.*}}, i64 0
- // CHECK: call float @llvm.fma.f32(float %{{.*}}, float %{{.*}}, float %{{.*}})
- // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: [[NEG:%.+]] = fneg float %{{.+}}
+ // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: [[NEG2:%.+]] = fneg float %{{.+}}
+ // CHECK: call float @llvm.fma.f32(float %{{.*}}, float [[NEG]], float [[NEG2]])
+ // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
return _mm_fnmsub_ss(a, b, c);
}
+TEST_CONSTEXPR(match_m128(_mm_fnmsub_ss((__m128){ -4.0f, 1.0f, -2.0f, -0.0f }, (__m128){ 2.0f, 4.0f, 2.0f, -0.0f }, (__m128){ 1.0f, -4.0f, 2.0f, 1.0f }), 7.0f, 1.0f, -2.0f, -0.0f));
__m128d test_mm_fnmsub_sd(__m128d a, __m128d b, __m128d c) {
// CHECK-LABEL: test_mm_fnmsub_sd
- // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.+}}
- // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.+}}
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: extractelement <2 x double> %{{.*}}, i64 0
- // CHECK: call double @llvm.fma.f64(double %{{.*}}, double %{{.*}}, double %{{.*}})
- // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: [[NEG:%.+]] = fneg double %{{.+}}
+ // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: [[NEG2:%.+]] = fneg double %{{.+}}
+ // CHECK: call double @llvm.fma.f64(double %{{.*}}, double [[NEG]], double [[NEG2]])
+ // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
return _mm_fnmsub_sd(a, b, c);
}
+TEST_CONSTEXPR(match_m128d(_mm_fnmsub_sd((__m128d){ -4.0, 1.0 }, (__m128d){ 1.0, 2.0 }, (__m128d){ -8.0, 3.0 }), 12.0, 1.0));
__m128 test_mm_fmaddsub_ps(__m128 a, __m128 b, __m128 c) {
// CHECK-LABEL: test_mm_fmaddsub_ps
diff --git a/clang/test/CodeGen/X86/fma4-builtins.c b/clang/test/CodeGen/X86/fma4-...
[truncated]
phoebewang left a comment:
LGTM.
@phoebewang what do you think about going back to using the __builtin_elementwise_fma calls and just getting this in? #154747 appears to have derailed, and using __builtin_elementwise_fma has the benefit that the scalar and vector FMA3/FMA4 intrinsics behave exactly the same.
__builtin_elementwise_fma sounds better. I thought we had already replaced everything with it :)
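For reference, a minimal sketch (not part of this patch) of what the constexpr support enables, mirroring the TEST_CONSTEXPR case added for _mm_fmadd_ss in the fma-builtins.c diff above. It assumes a Clang build containing this patch, compiling as C++ with -mfma:

#include <immintrin.h>

// _mm_fmadd_ss can now be evaluated at compile time.
constexpr __m128 r = _mm_fmadd_ss((__m128){-4.0f, 1.0f, -2.0f, -0.0f},
                                  (__m128){ 2.0f, 4.0f,  2.0f, -0.0f},
                                  (__m128){ 1.0f, -4.0f, 2.0f,  1.0f});
static_assert(r[0] == -7.0f, "low lane: (-4.0f * 2.0f) + 1.0f");
static_assert(r[1] == 1.0f && r[2] == -2.0f,
              "upper lanes are passed through from __A (FMA3 semantics)");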
…ementwise_fma and support constexpr

Now that llvm#152455 is done, we can make all of the scalar FMA intrinsics wrap __builtin_elementwise_fma, which also allows constexpr evaluation.

The main difference is that the FMA4 intrinsics guarantee that the upper elements are zero, while FMA3 passes through the destination register elements, like older scalar instructions.
Force-pushed from b404ed7 to 4407090
Now that #152455 is done, we can make all of the scalar FMA intrinsics wrap __builtin_elementwise_fma, which also allows constexpr evaluation.

The main difference is that the FMA4 intrinsics guarantee that the upper elements are zero, while FMA3 passes through the destination register elements, like older scalar instructions.

Fixes #154555
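To make that difference concrete, here is a small usage sketch (illustrative only, not part of the patch) based on the updated fmaintrin.h and fma4intrin.h bodies above; it assumes a compiler and target with the respective FMA3/FMA4 features enabled:

#include <x86intrin.h>

// With a = {-4, 1, -2, -0}, b = {2, 4, 2, -0}, c = {1, -4, 2, 1}:

__m128 fma3_scalar(__m128 a, __m128 b, __m128 c) {
  // FMA3: lane 0 = fma(a[0], b[0], c[0]) = -7.0f,
  // lanes 1..3 are passed through from a, giving {-7, 1, -2, -0}.
  return _mm_fmadd_ss(a, b, c);
}

__m128 fma4_scalar(__m128 a, __m128 b, __m128 c) {
  // FMA4: lane 0 = fma(a[0], b[0], c[0]) = -7.0f,
  // lanes 1..3 are guaranteed zero, giving {-7, 0, 0, 0}.
  return _mm_macc_ss(a, b, c);
}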