Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions libc/src/__support/FPUtil/FMA.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,26 @@ LIBC_INLINE OutType fma(InType x, InType y, InType z) {
}

#ifdef LIBC_TARGET_CPU_HAS_FMA

#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
template <> LIBC_INLINE float fma(float x, float y, float z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
#else
return __builtin_fmaf(x, y, z);
#endif
}
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT

#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
template <> LIBC_INLINE double fma(double x, double y, double z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
#else
return __builtin_fma(x, y, z);
#endif
}
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#endif // LIBC_TARGET_CPU_HAS_FMA

} // namespace fputil
Expand Down
36 changes: 28 additions & 8 deletions libc/src/__support/FPUtil/double_double.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,26 @@ LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) {
return r;
}

// The templated exact multiplication needs template version of
// LIBC_TARGET_CPU_HAS_FMA_* macro to correctly select the implementation.
// These can be moved to "src/__support/macros/properties/cpu_features.h" if
// other part of libc needed.
template <typename T> struct TargetHasFmaInstruction {
static constexpr bool VALUE = false;
};

#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
template <> struct TargetHasFmaInstruction<float> {
static constexpr bool VALUE = true;
};
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT

#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
template <> struct TargetHasFmaInstruction<double> {
static constexpr bool VALUE = true;
};
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

// Note: When FMA instruction is not available, the `exact_mult` function is
// only correct for round-to-nearest mode. See:
// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
Expand All @@ -111,15 +131,15 @@ template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
NumberPair<T> r{0.0, 0.0};

#ifdef LIBC_TARGET_CPU_HAS_FMA
r.hi = a * b;
r.lo = fputil::multiply_add(a, b, -r.hi);
#else
// Dekker's Product.
NumberPair<T> as = split(a);
if constexpr (TargetHasFmaInstruction<T>::VALUE) {
r.hi = a * b;
r.lo = fputil::multiply_add(a, b, -r.hi);
} else {
// Dekker's Product.
NumberPair<T> as = split(a);

r = exact_mult<T, SPLIT_B>(as, a, b);
#endif // LIBC_TARGET_CPU_HAS_FMA
r = exact_mult<T, SPLIT_B>(as, a, b);
}

return r;
}
Expand Down
4 changes: 4 additions & 0 deletions libc/src/__support/FPUtil/multiply_add.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,25 @@ multiply_add(T x, T y, T z) {
namespace LIBC_NAMESPACE_DECL {
namespace fputil {

#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
LIBC_INLINE float multiply_add(float x, float y, float z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
#else
return __builtin_fmaf(x, y, z);
#endif
}
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT

#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
LIBC_INLINE double multiply_add(double x, double y, double z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
#else
return __builtin_fma(x, y, z);
#endif
}
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

} // namespace fputil
} // namespace LIBC_NAMESPACE_DECL
Expand Down
15 changes: 15 additions & 0 deletions libc/src/__support/macros/properties/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,21 @@
#if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) || \
defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA)
#define LIBC_TARGET_CPU_HAS_FMA
// Provide a more fine-grained control of FMA instruction for ARM targets.
#if defined(__ARM_FP)
#if (__ARM_FP & 0x2)
#define LIBC_TARGET_CPU_HAS_FMA_HALF
#endif // LIBC_TARGET_CPU_HAS_FMA_HALF
#if (__ARM_FP & 0x4)
#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
#if (__ARM_FP & 0x8)
#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#else
#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#endif
#endif

#if defined(LIBC_TARGET_ARCH_IS_AARCH64) || \
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/asinf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,12 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) {
// |x| < 2^-125. For targets without FMA instructions, we simply use
// double for intermediate results as it is more efficient than using an
// emulated version of FMA.
#if defined(LIBC_TARGET_CPU_HAS_FMA)
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(x, 0x1.0p-25f, x);
#else
double xd = static_cast<double>(x);
return static_cast<float>(fputil::multiply_add(xd, 0x1.0p-25, xd));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}

// Check for exceptional values
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/atan2f.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
num_r = num_d;
den_r = den_d;
}
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
q.lo = fputil::multiply_add(q.hi, -den_r, num_r) / den_r;
#else
// Compute `(num_r - q.hi * den_r) / den_r` accurately without FMA
Expand All @@ -140,7 +140,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
double t1 = fputil::multiply_add(q_hi_dd.hi, -den_r, num_r); // Exact
double t2 = fputil::multiply_add(q_hi_dd.lo, -den_r, t1);
q.lo = t2 / den_r;
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

// Taylor polynomial, evaluating using Horner's scheme:
// P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9 - x^11/11 + x^13/13 - x^15/15
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/atanf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,12 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
return x;
// x <= 2^-12;
if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
#if defined(LIBC_TARGET_CPU_HAS_FMA)
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(x, -0x1.0p-25f, x);
#else
double x_d = static_cast<double>(x);
return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}
// Use Taylor polynomial:
// atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
Expand Down
2 changes: 1 addition & 1 deletion libc/src/math/generic/cbrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ double intial_approximation(double x) {

// Get the error term for Newton iteration:
// h(x) = x^3 * a^2 - 1,
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/cos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@
#include "src/math/generic/range_reduction_double_common.h"
#include "src/math/generic/sincos_eval.h"

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#include "range_reduction_double_fma.h"
#else
#include "range_reduction_double_nofma.h"
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

namespace LIBC_NAMESPACE_DECL {

Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/cosf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
// |x| < 2^-125. For targets without FMA instructions, we simply use
// double for intermediate results as it is more efficient than using an
// emulated version of FMA.
#if defined(LIBC_TARGET_CPU_HAS_FMA)
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
#else
return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}

if (auto r = COSF_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/cospif.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ LLVM_LIBC_FUNCTION(float, cospif, (float x)) {
// The exhautive test passes for smaller values
if (LIBC_UNLIKELY(x_abs < 0x38A2'F984U)) {

#if defined(LIBC_TARGET_CPU_HAS_FMA)
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
#else
return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}

// Numbers greater or equal to 2^23 are always integers or NaN
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/exp10f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

namespace LIBC_NAMESPACE_DECL {

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXP10F16_EXCEPTS = 5;
#else
static constexpr size_t N_EXP10F16_EXCEPTS = 8;
Expand All @@ -44,7 +44,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
{0xbf0aU, 0x2473U, 1U, 0U, 0U},
// x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ)
{0xc387U, 0x09a5U, 1U, 0U, 0U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ)
{0x4030U, 0x57c1U, 1U, 0U, 1U},
// x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ)
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/exp10m1f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ static constexpr fputil::ExceptValues<float16, 3> EXP10M1F16_EXCEPTS_LO = {{
{0x9788U, 0x9c53U, 0U, 1U, 0U},
}};

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3;
#else
static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6;
Expand All @@ -49,7 +49,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10M1F16_EXCEPTS_HI>
{0x3657U, 0x3df6U, 1U, 0U, 0U},
// x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ)
{0x3741U, 0x3f5cU, 1U, 0U, 1U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ)
{0x4030U, 0x57b1U, 1U, 0U, 1U},
// x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ)
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/exp2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ using LIBC_NAMESPACE::operator""_u128;

// Error bounds:
// Errors when using double precision.
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
constexpr double ERR_D = 0x1.0p-63;
#else
constexpr double ERR_D = 0x1.8p-63;
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
// Errors when using double-double precision.
Expand Down
8 changes: 4 additions & 4 deletions libc/src/math/generic/exp2m1f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ static constexpr fputil::ExceptValues<float16, 6> EXP2M1F16_EXCEPTS_LO = {{
{0x973fU, 0x9505U, 0U, 1U, 0U},
}};

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6;
#else
static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7;
Expand All @@ -51,13 +51,13 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
// (input, RZ output, RU offset, RD offset, RN offset)
// x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ)
{0x3396U, 0x31b7U, 1U, 0U, 0U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ)
{0x34baU, 0x3345U, 1U, 0U, 0U},
#endif
// x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ)
{0x36b6U, 0x3566U, 1U, 0U, 0U},
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ)
{0x37b7U, 0x3659U, 1U, 0U, 1U},
#endif
Expand All @@ -67,7 +67,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
{0xb3ccU, 0xb0f9U, 0U, 1U, 0U},
// x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ)
{0xb8a5U, 0xb54cU, 0U, 1U, 1U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ)
{0xba8dU, 0xb6edU, 0U, 1U, 1U},
#endif
Expand Down
10 changes: 5 additions & 5 deletions libc/src/math/generic/expm1f.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,14 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
return 0x1.8dbe62p-3f;
}

#if !defined(LIBC_TARGET_CPU_HAS_FMA)
#if !defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE)
if (LIBC_UNLIKELY(x_u == 0xbdc1'c6cbU)) { // x = -0x1.838d96p-4f
int round_mode = fputil::quick_get_round();
if (round_mode == FE_TONEAREST || round_mode == FE_DOWNWARD)
return -0x1.71c884p-4f;
return -0x1.71c882p-4f;
}
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

// When |x| > 25*log(2), or nan
if (LIBC_UNLIKELY(x_abs >= 0x418a'a123U)) {
Expand Down Expand Up @@ -102,12 +102,12 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
// 2^-76. For targets without FMA instructions, we simply use double for
// intermediate results as it is more efficient than using an emulated
// version of FMA.
#if defined(LIBC_TARGET_CPU_HAS_FMA)
return fputil::fma<float>(x, x, x);
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(x, x, x);
#else
double xd = x;
return static_cast<float>(fputil::multiply_add(xd, xd, xd));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}

constexpr double COEFFS[] = {0x1p-1,
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/expm1f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ static constexpr fputil::ExceptValues<float16, 1> EXPM1F16_EXCEPTS_LO = {{
{0x2959U, 0x2975U, 1U, 0U, 1U},
}};

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 2;
#else
static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 3;
Expand All @@ -42,7 +42,7 @@ static constexpr fputil::ExceptValues<float16, N_EXPM1F16_EXCEPTS_HI>
{0x3f0dU, 0x44d3U, 1U, 0U, 1U},
// x = -0x1.e28p-3, expm1f16(x) = -0x1.adcp-3 (RZ)
{0xb38aU, 0xb2b7U, 0U, 1U, 1U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.a08p-3, exp10m1f(x) = 0x1.cdcp-3 (RZ)
{0x3282U, 0x3337U, 1U, 0U, 0U},
#endif
Expand Down
2 changes: 1 addition & 1 deletion libc/src/math/generic/fmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
// correctly rounded for all rounding modes, so we fall
// back to the generic `fmul` implementation

#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
return fputil::generic::mul<float>(x, y);
#else
fputil::DoubleDouble prod = fputil::exact_mult(x, y);
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/hypotf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {

// These squares are exact.
double a_sq = ad * ad;
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double sum_sq = fputil::multiply_add(bd, bd, a_sq);
#else
double b_sq = bd * bd;
Expand All @@ -72,7 +72,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
double r_d = result.get_val();

// Perform rounding correction.
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double sum_sq_lo = fputil::multiply_add(bd, bd, a_sq - sum_sq);
double err = sum_sq_lo - fputil::multiply_add(r_d, r_d, -sum_sq);
#else
Expand Down
Loading