Skip to content

Commit 5038e2f

Browse files
authored
[libc] Provide more fine-grained control of FMA instruction for ARM targets. (llvm#130700)
1 parent f1e3675 commit 5038e2f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+149
-105
lines changed

libc/src/__support/FPUtil/FMA.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,26 @@ LIBC_INLINE OutType fma(InType x, InType y, InType z) {
2424
}
2525

2626
#ifdef LIBC_TARGET_CPU_HAS_FMA
27+
28+
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
2729
template <> LIBC_INLINE float fma(float x, float y, float z) {
2830
#if __has_builtin(__builtin_elementwise_fma)
2931
return __builtin_elementwise_fma(x, y, z);
3032
#else
3133
return __builtin_fmaf(x, y, z);
3234
#endif
3335
}
36+
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
3437

38+
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
3539
template <> LIBC_INLINE double fma(double x, double y, double z) {
3640
#if __has_builtin(__builtin_elementwise_fma)
3741
return __builtin_elementwise_fma(x, y, z);
3842
#else
3943
return __builtin_fma(x, y, z);
4044
#endif
4145
}
46+
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
4247
#endif // LIBC_TARGET_CPU_HAS_FMA
4348

4449
} // namespace fputil

libc/src/__support/FPUtil/double_double.h

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,26 @@ LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) {
100100
return r;
101101
}
102102

103+
// The templated exact multiplication needs a template version of the
104+
// LIBC_TARGET_CPU_HAS_FMA_* macro to correctly select the implementation.
105+
// These can be moved to "src/__support/macros/properties/cpu_features.h" if
106+
// other parts of libc need them.
107+
template <typename T> struct TargetHasFmaInstruction {
108+
static constexpr bool VALUE = false;
109+
};
110+
111+
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
112+
template <> struct TargetHasFmaInstruction<float> {
113+
static constexpr bool VALUE = true;
114+
};
115+
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
116+
117+
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
118+
template <> struct TargetHasFmaInstruction<double> {
119+
static constexpr bool VALUE = true;
120+
};
121+
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
122+
103123
// Note: When an FMA instruction is not available, the `exact_mult` function is
104124
// only correct for round-to-nearest mode. See:
105125
// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
@@ -111,15 +131,15 @@ template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
111131
LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
112132
NumberPair<T> r{0.0, 0.0};
113133

114-
#ifdef LIBC_TARGET_CPU_HAS_FMA
115-
r.hi = a * b;
116-
r.lo = fputil::multiply_add(a, b, -r.hi);
117-
#else
118-
// Dekker's Product.
119-
NumberPair<T> as = split(a);
134+
if constexpr (TargetHasFmaInstruction<T>::VALUE) {
135+
r.hi = a * b;
136+
r.lo = fputil::multiply_add(a, b, -r.hi);
137+
} else {
138+
// Dekker's Product.
139+
NumberPair<T> as = split(a);
120140

121-
r = exact_mult<T, SPLIT_B>(as, a, b);
122-
#endif // LIBC_TARGET_CPU_HAS_FMA
141+
r = exact_mult<T, SPLIT_B>(as, a, b);
142+
}
123143

124144
return r;
125145
}

libc/src/__support/FPUtil/multiply_add.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,21 +46,25 @@ multiply_add(T x, T y, T z) {
4646
namespace LIBC_NAMESPACE_DECL {
4747
namespace fputil {
4848

49+
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
4950
LIBC_INLINE float multiply_add(float x, float y, float z) {
5051
#if __has_builtin(__builtin_elementwise_fma)
5152
return __builtin_elementwise_fma(x, y, z);
5253
#else
5354
return __builtin_fmaf(x, y, z);
5455
#endif
5556
}
57+
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
5658

59+
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
5760
LIBC_INLINE double multiply_add(double x, double y, double z) {
5861
#if __has_builtin(__builtin_elementwise_fma)
5962
return __builtin_elementwise_fma(x, y, z);
6063
#else
6164
return __builtin_fma(x, y, z);
6265
#endif
6366
}
67+
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
6468

6569
} // namespace fputil
6670
} // namespace LIBC_NAMESPACE_DECL

libc/src/__support/macros/properties/cpu_features.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,21 @@
4545
#if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) || \
4646
defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA)
4747
#define LIBC_TARGET_CPU_HAS_FMA
48+
// Provide more fine-grained control of FMA instructions for ARM targets.
49+
#if defined(__ARM_FP)
50+
#if (__ARM_FP & 0x2)
51+
#define LIBC_TARGET_CPU_HAS_FMA_HALF
52+
#endif // LIBC_TARGET_CPU_HAS_FMA_HALF
53+
#if (__ARM_FP & 0x4)
54+
#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
55+
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
56+
#if (__ARM_FP & 0x8)
57+
#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
58+
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
59+
#else
60+
#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
61+
#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
62+
#endif
4863
#endif
4964

5065
#if defined(LIBC_TARGET_ARCH_IS_AARCH64) || \

libc/src/math/generic/asinf.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,12 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) {
7474
// |x| < 2^-125. For targets without FMA instructions, we simply use
7575
// double for intermediate results as it is more efficient than using an
7676
// emulated version of FMA.
77-
#if defined(LIBC_TARGET_CPU_HAS_FMA)
77+
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
7878
return fputil::multiply_add(x, 0x1.0p-25f, x);
7979
#else
8080
double xd = static_cast<double>(x);
8181
return static_cast<float>(fputil::multiply_add(xd, 0x1.0p-25, xd));
82-
#endif // LIBC_TARGET_CPU_HAS_FMA
82+
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
8383
}
8484

8585
// Check for exceptional values

libc/src/math/generic/atan2f.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
131131
num_r = num_d;
132132
den_r = den_d;
133133
}
134-
#ifdef LIBC_TARGET_CPU_HAS_FMA
134+
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
135135
q.lo = fputil::multiply_add(q.hi, -den_r, num_r) / den_r;
136136
#else
137137
// Compute `(num_r - q.hi * den_r) / den_r` accurately without FMA
@@ -140,7 +140,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
140140
double t1 = fputil::multiply_add(q_hi_dd.hi, -den_r, num_r); // Exact
141141
double t2 = fputil::multiply_add(q_hi_dd.lo, -den_r, t1);
142142
q.lo = t2 / den_r;
143-
#endif // LIBC_TARGET_CPU_HAS_FMA
143+
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
144144

145145
// Taylor polynomial, evaluating using Horner's scheme:
146146
// P = x - x^3/3 + x^5/5 - x^7/7 + x^9/9 - x^11/11 + x^13/13 - x^15/15

libc/src/math/generic/atanf.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,12 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
5252
return x;
5353
// |x| < 2^-12.
5454
if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
55-
#if defined(LIBC_TARGET_CPU_HAS_FMA)
55+
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
5656
return fputil::multiply_add(x, -0x1.0p-25f, x);
5757
#else
5858
double x_d = static_cast<double>(x);
5959
return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
60-
#endif // LIBC_TARGET_CPU_HAS_FMA
60+
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
6161
}
6262
// Use Taylor polynomial:
6363
// atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).

libc/src/math/generic/cbrt.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ double intial_approximation(double x) {
5858

5959
// Get the error term for Newton iteration:
6060
// h(x) = x^3 * a^2 - 1,
61-
#ifdef LIBC_TARGET_CPU_HAS_FMA
61+
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
6262
double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
6363
return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
6464
fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);

libc/src/math/generic/cos.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@
2020
#include "src/math/generic/range_reduction_double_common.h"
2121
#include "src/math/generic/sincos_eval.h"
2222

23-
#ifdef LIBC_TARGET_CPU_HAS_FMA
23+
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
2424
#include "range_reduction_double_fma.h"
2525
#else
2626
#include "range_reduction_double_nofma.h"
27-
#endif // LIBC_TARGET_CPU_HAS_FMA
27+
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
2828

2929
namespace LIBC_NAMESPACE_DECL {
3030

libc/src/math/generic/cosf.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
101101
// |x| < 2^-125. For targets without FMA instructions, we simply use
102102
// double for intermediate results as it is more efficient than using an
103103
// emulated version of FMA.
104-
#if defined(LIBC_TARGET_CPU_HAS_FMA)
104+
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
105105
return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
106106
#else
107107
return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
108-
#endif // LIBC_TARGET_CPU_HAS_FMA
108+
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
109109
}
110110

111111
if (auto r = COSF_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))

0 commit comments

Comments
 (0)