Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions libc/src/__support/FPUtil/FMA.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,26 @@ LIBC_INLINE OutType fma(InType x, InType y, InType z) {
}

#ifdef LIBC_TARGET_CPU_HAS_FMA

#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
template <> LIBC_INLINE float fma(float x, float y, float z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
#else
return __builtin_fmaf(x, y, z);
#endif
}
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT

#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
template <> LIBC_INLINE double fma(double x, double y, double z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
#else
return __builtin_fma(x, y, z);
#endif
}
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#endif // LIBC_TARGET_CPU_HAS_FMA

} // namespace fputil
Expand Down
36 changes: 28 additions & 8 deletions libc/src/__support/FPUtil/double_double.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,26 @@ LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) {
return r;
}

// The templated exact multiplication needs template version of
// LIBC_TARGET_CPU_HAS_FMA_* macro to correctly select the implementation.
// These can be moved to "src/__support/macros/properties/cpu_features.h" if
// other part of libc needed.
template <typename T> struct TargetHasFmaInstruction {
static constexpr bool VALUE = false;
};

#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
template <> struct TargetHasFmaInstruction<float> {
static constexpr bool VALUE = true;
};
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT

#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
template <> struct TargetHasFmaInstruction<double> {
static constexpr bool VALUE = true;
};
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

// Note: When FMA instruction is not available, the `exact_mult` function is
// only correct for round-to-nearest mode. See:
// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
Expand All @@ -111,15 +131,15 @@ template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
NumberPair<T> r{0.0, 0.0};

#ifdef LIBC_TARGET_CPU_HAS_FMA
r.hi = a * b;
r.lo = fputil::multiply_add(a, b, -r.hi);
#else
// Dekker's Product.
NumberPair<T> as = split(a);
if constexpr (TargetHasFmaInstruction<T>::VALUE) {
r.hi = a * b;
r.lo = fputil::multiply_add(a, b, -r.hi);
} else {
// Dekker's Product.
NumberPair<T> as = split(a);

r = exact_mult<T, SPLIT_B>(as, a, b);
#endif // LIBC_TARGET_CPU_HAS_FMA
r = exact_mult<T, SPLIT_B>(as, a, b);
}

return r;
}
Expand Down
4 changes: 4 additions & 0 deletions libc/src/__support/FPUtil/multiply_add.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,25 @@ multiply_add(T x, T y, T z) {
namespace LIBC_NAMESPACE_DECL {
namespace fputil {

#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
LIBC_INLINE float multiply_add(float x, float y, float z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
#else
return __builtin_fmaf(x, y, z);
#endif
}
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT

#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
LIBC_INLINE double multiply_add(double x, double y, double z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
#else
return __builtin_fma(x, y, z);
#endif
}
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

} // namespace fputil
} // namespace LIBC_NAMESPACE_DECL
Expand Down
15 changes: 15 additions & 0 deletions libc/src/__support/macros/properties/cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,21 @@
#if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) || \
defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA)
#define LIBC_TARGET_CPU_HAS_FMA
// Provide a more fine-grained control of FMA instruction for ARM targets.
#if defined(__ARM_FP)
#if (__ARM_FP & 0x2)
#define LIBC_TARGET_CPU_HAS_FMA_HALF
#endif // LIBC_TARGET_CPU_HAS_FMA_HALF
#if (__ARM_FP & 0x4)
#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
#if (__ARM_FP & 0x8)
#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#else
#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#endif
#endif

#if defined(LIBC_TARGET_ARCH_IS_AARCH64) || \
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/asinf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,12 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) {
// |x| < 2^-125. For targets without FMA instructions, we simply use
// double for intermediate results as it is more efficient than using an
// emulated version of FMA.
#if defined(LIBC_TARGET_CPU_HAS_FMA)
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(x, 0x1.0p-25f, x);
#else
double xd = static_cast<double>(x);
return static_cast<float>(fputil::multiply_add(xd, 0x1.0p-25, xd));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}

// Check for exceptional values
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/atan2f.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
num_r = num_d;
den_r = den_d;
}
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
q.lo = fputil::multiply_add(q.hi, -den_r, num_r) / den_r;
#else
// Compute `(num_r - q.hi * den_r) / den_r` accurately without FMA
Expand All @@ -140,7 +140,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
double t1 = fputil::multiply_add(q_hi_dd.hi, -den_r, num_r); // Exact
double t2 = fputil::multiply_add(q_hi_dd.lo, -den_r, t1);
q.lo = t2 / den_r;
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

// Taylor polynomial, evaluating using Horner's scheme:
// P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9 - x^11/11 + x^13/13 - x^15/15
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/atanf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,12 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
return x;
// x <= 2^-12;
if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
#if defined(LIBC_TARGET_CPU_HAS_FMA)
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(x, -0x1.0p-25f, x);
#else
double x_d = static_cast<double>(x);
return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}
// Use Taylor polynomial:
// atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
Expand Down
2 changes: 1 addition & 1 deletion libc/src/math/generic/cbrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ double intial_approximation(double x) {

// Get the error term for Newton iteration:
// h(x) = x^3 * a^2 - 1,
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/cos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@
#include "src/math/generic/range_reduction_double_common.h"
#include "src/math/generic/sincos_eval.h"

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#include "range_reduction_double_fma.h"
#else
#include "range_reduction_double_nofma.h"
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

namespace LIBC_NAMESPACE_DECL {

Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/cosf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
// |x| < 2^-125. For targets without FMA instructions, we simply use
// double for intermediate results as it is more efficient than using an
// emulated version of FMA.
#if defined(LIBC_TARGET_CPU_HAS_FMA)
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
#else
return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}

if (auto r = COSF_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/cospif.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ LLVM_LIBC_FUNCTION(float, cospif, (float x)) {
// The exhautive test passes for smaller values
if (LIBC_UNLIKELY(x_abs < 0x38A2'F984U)) {

#if defined(LIBC_TARGET_CPU_HAS_FMA)
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
#else
return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}

// Numbers greater or equal to 2^23 are always integers or NaN
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/exp10f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

namespace LIBC_NAMESPACE_DECL {

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXP10F16_EXCEPTS = 5;
#else
static constexpr size_t N_EXP10F16_EXCEPTS = 8;
Expand All @@ -44,7 +44,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
{0xbf0aU, 0x2473U, 1U, 0U, 0U},
// x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ)
{0xc387U, 0x09a5U, 1U, 0U, 0U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ)
{0x4030U, 0x57c1U, 1U, 0U, 1U},
// x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ)
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/exp10m1f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ static constexpr fputil::ExceptValues<float16, 3> EXP10M1F16_EXCEPTS_LO = {{
{0x9788U, 0x9c53U, 0U, 1U, 0U},
}};

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3;
#else
static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6;
Expand All @@ -49,7 +49,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10M1F16_EXCEPTS_HI>
{0x3657U, 0x3df6U, 1U, 0U, 0U},
// x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ)
{0x3741U, 0x3f5cU, 1U, 0U, 1U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ)
{0x4030U, 0x57b1U, 1U, 0U, 1U},
// x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ)
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/exp2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ using LIBC_NAMESPACE::operator""_u128;

// Error bounds:
// Errors when using double precision.
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
constexpr double ERR_D = 0x1.0p-63;
#else
constexpr double ERR_D = 0x1.8p-63;
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
// Errors when using double-double precision.
Expand Down
8 changes: 4 additions & 4 deletions libc/src/math/generic/exp2m1f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ static constexpr fputil::ExceptValues<float16, 6> EXP2M1F16_EXCEPTS_LO = {{
{0x973fU, 0x9505U, 0U, 1U, 0U},
}};

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6;
#else
static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7;
Expand All @@ -51,13 +51,13 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
// (input, RZ output, RU offset, RD offset, RN offset)
// x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ)
{0x3396U, 0x31b7U, 1U, 0U, 0U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ)
{0x34baU, 0x3345U, 1U, 0U, 0U},
#endif
// x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ)
{0x36b6U, 0x3566U, 1U, 0U, 0U},
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ)
{0x37b7U, 0x3659U, 1U, 0U, 1U},
#endif
Expand All @@ -67,7 +67,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
{0xb3ccU, 0xb0f9U, 0U, 1U, 0U},
// x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ)
{0xb8a5U, 0xb54cU, 0U, 1U, 1U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ)
{0xba8dU, 0xb6edU, 0U, 1U, 1U},
#endif
Expand Down
10 changes: 5 additions & 5 deletions libc/src/math/generic/expm1f.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,14 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
return 0x1.8dbe62p-3f;
}

#if !defined(LIBC_TARGET_CPU_HAS_FMA)
#if !defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE)
if (LIBC_UNLIKELY(x_u == 0xbdc1'c6cbU)) { // x = -0x1.838d96p-4f
int round_mode = fputil::quick_get_round();
if (round_mode == FE_TONEAREST || round_mode == FE_DOWNWARD)
return -0x1.71c884p-4f;
return -0x1.71c882p-4f;
}
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE

// When |x| > 25*log(2), or nan
if (LIBC_UNLIKELY(x_abs >= 0x418a'a123U)) {
Expand Down Expand Up @@ -102,12 +102,12 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
// 2^-76. For targets without FMA instructions, we simply use double for
// intermediate results as it is more efficient than using an emulated
// version of FMA.
#if defined(LIBC_TARGET_CPU_HAS_FMA)
return fputil::fma<float>(x, x, x);
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add<float>(x, x, x);
#else
double xd = x;
return static_cast<float>(fputil::multiply_add(xd, xd, xd));
#endif // LIBC_TARGET_CPU_HAS_FMA
#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}

constexpr double COEFFS[] = {0x1p-1,
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/expm1f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ static constexpr fputil::ExceptValues<float16, 1> EXPM1F16_EXCEPTS_LO = {{
{0x2959U, 0x2975U, 1U, 0U, 1U},
}};

#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 2;
#else
static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 3;
Expand All @@ -42,7 +42,7 @@ static constexpr fputil::ExceptValues<float16, N_EXPM1F16_EXCEPTS_HI>
{0x3f0dU, 0x44d3U, 1U, 0U, 1U},
// x = -0x1.e28p-3, expm1f16(x) = -0x1.adcp-3 (RZ)
{0xb38aU, 0xb2b7U, 0U, 1U, 1U},
#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.a08p-3, exp10m1f(x) = 0x1.cdcp-3 (RZ)
{0x3282U, 0x3337U, 1U, 0U, 0U},
#endif
Expand Down
2 changes: 1 addition & 1 deletion libc/src/math/generic/fmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
// correctly rounded for all rounding modes, so we fall
// back to the generic `fmul` implementation

#ifndef LIBC_TARGET_CPU_HAS_FMA
#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
return fputil::generic::mul<float>(x, y);
#else
fputil::DoubleDouble prod = fputil::exact_mult(x, y);
Expand Down
4 changes: 2 additions & 2 deletions libc/src/math/generic/hypotf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {

// These squares are exact.
double a_sq = ad * ad;
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double sum_sq = fputil::multiply_add(bd, bd, a_sq);
#else
double b_sq = bd * bd;
Expand All @@ -72,7 +72,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
double r_d = result.get_val();

// Perform rounding correction.
#ifdef LIBC_TARGET_CPU_HAS_FMA
#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double sum_sq_lo = fputil::multiply_add(bd, bd, a_sq - sum_sq);
double err = sum_sq_lo - fputil::multiply_add(r_d, r_d, -sum_sq);
#else
Expand Down
Loading
Loading