diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h index 1e40d06dc1462..2cafb4c0974e3 100644 --- a/libc/src/__support/FPUtil/FMA.h +++ b/libc/src/__support/FPUtil/FMA.h @@ -24,6 +24,8 @@ LIBC_INLINE OutType fma(InType x, InType y, InType z) { } #ifdef LIBC_TARGET_CPU_HAS_FMA + +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT template <> LIBC_INLINE float fma(float x, float y, float z) { #if __has_builtin(__builtin_elementwise_fma) return __builtin_elementwise_fma(x, y, z); @@ -31,7 +33,9 @@ template <> LIBC_INLINE float fma(float x, float y, float z) { return __builtin_fmaf(x, y, z); #endif } +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE template <> LIBC_INLINE double fma(double x, double y, double z) { #if __has_builtin(__builtin_elementwise_fma) return __builtin_elementwise_fma(x, y, z); @@ -39,6 +43,7 @@ template <> LIBC_INLINE double fma(double x, double y, double z) { return __builtin_fma(x, y, z); #endif } +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE #endif // LIBC_TARGET_CPU_HAS_FMA } // namespace fputil diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h index b24ffd4aa456f..c27885aadc028 100644 --- a/libc/src/__support/FPUtil/double_double.h +++ b/libc/src/__support/FPUtil/double_double.h @@ -100,6 +100,26 @@ LIBC_INLINE NumberPair exact_mult(const NumberPair &as, T a, T b) { return r; } +// The templated exact multiplication needs a template version of the +// LIBC_TARGET_CPU_HAS_FMA_* macros to correctly select the implementation. +// These can be moved to "src/__support/macros/properties/cpu_features.h" if +// other parts of libc need them. 
+template struct TargetHasFmaInstruction { + static constexpr bool VALUE = false; +}; + +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT +template <> struct TargetHasFmaInstruction { + static constexpr bool VALUE = true; +}; +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT + +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE +template <> struct TargetHasFmaInstruction { + static constexpr bool VALUE = true; +}; +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + // Note: When FMA instruction is not available, the `exact_mult` function is // only correct for round-to-nearest mode. See: // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed @@ -111,15 +131,15 @@ template ::VALUE> LIBC_INLINE NumberPair exact_mult(T a, T b) { NumberPair r{0.0, 0.0}; -#ifdef LIBC_TARGET_CPU_HAS_FMA - r.hi = a * b; - r.lo = fputil::multiply_add(a, b, -r.hi); -#else - // Dekker's Product. - NumberPair as = split(a); + if constexpr (TargetHasFmaInstruction::VALUE) { + r.hi = a * b; + r.lo = fputil::multiply_add(a, b, -r.hi); + } else { + // Dekker's Product. 
+ NumberPair as = split(a); - r = exact_mult(as, a, b); -#endif // LIBC_TARGET_CPU_HAS_FMA + r = exact_mult(as, a, b); + } return r; } diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h index ae00e08673d08..8260702e2c9f4 100644 --- a/libc/src/__support/FPUtil/multiply_add.h +++ b/libc/src/__support/FPUtil/multiply_add.h @@ -46,6 +46,7 @@ multiply_add(T x, T y, T z) { namespace LIBC_NAMESPACE_DECL { namespace fputil { +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT LIBC_INLINE float multiply_add(float x, float y, float z) { #if __has_builtin(__builtin_elementwise_fma) return __builtin_elementwise_fma(x, y, z); @@ -53,7 +54,9 @@ LIBC_INLINE float multiply_add(float x, float y, float z) { return __builtin_fmaf(x, y, z); #endif } +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE LIBC_INLINE double multiply_add(double x, double y, double z) { #if __has_builtin(__builtin_elementwise_fma) return __builtin_elementwise_fma(x, y, z); @@ -61,6 +64,7 @@ LIBC_INLINE double multiply_add(double x, double y, double z) { return __builtin_fma(x, y, z); #endif } +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE } // namespace fputil } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h index d2cea367516db..1714775ca334d 100644 --- a/libc/src/__support/macros/properties/cpu_features.h +++ b/libc/src/__support/macros/properties/cpu_features.h @@ -45,6 +45,21 @@ #if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) || \ defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA) #define LIBC_TARGET_CPU_HAS_FMA +// Provide more fine-grained control of FMA instructions for ARM targets. 
+#if defined(__ARM_FP) +#if (__ARM_FP & 0x2) +#define LIBC_TARGET_CPU_HAS_FMA_HALF +#endif // LIBC_TARGET_CPU_HAS_FMA_HALF +#if (__ARM_FP & 0x4) +#define LIBC_TARGET_CPU_HAS_FMA_FLOAT +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT +#if (__ARM_FP & 0x8) +#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE +#else +#define LIBC_TARGET_CPU_HAS_FMA_FLOAT +#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE +#endif #endif #if defined(LIBC_TARGET_ARCH_IS_AARCH64) || \ diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp index 3a89def8f6e0c..b54a9e7b2b00b 100644 --- a/libc/src/math/generic/asinf.cpp +++ b/libc/src/math/generic/asinf.cpp @@ -74,12 +74,12 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) { // |x| < 2^-125. For targets without FMA instructions, we simply use // double for intermediate results as it is more efficient than using an // emulated version of FMA. -#if defined(LIBC_TARGET_CPU_HAS_FMA) +#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) return fputil::multiply_add(x, 0x1.0p-25f, x); #else double xd = static_cast(x); return static_cast(fputil::multiply_add(xd, 0x1.0p-25, xd)); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT } // Check for exceptional values diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp index 5ac2b29438ea9..726cae9c8462b 100644 --- a/libc/src/math/generic/atan2f.cpp +++ b/libc/src/math/generic/atan2f.cpp @@ -131,7 +131,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx, num_r = num_d; den_r = den_d; } -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE q.lo = fputil::multiply_add(q.hi, -den_r, num_r) / den_r; #else // Compute `(num_r - q.hi * den_r) / den_r` accurately without FMA @@ -140,7 +140,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx, double t1 = fputil::multiply_add(q_hi_dd.hi, -den_r, num_r); // Exact double t2 = fputil::multiply_add(q_hi_dd.lo, -den_r, t1); 
q.lo = t2 / den_r; -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE // Taylor polynomial, evaluating using Horner's scheme: // P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9 - x^11/11 + x^13/13 - x^15/15 diff --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp index 5e0788efbeb88..46196dbe4162c 100644 --- a/libc/src/math/generic/atanf.cpp +++ b/libc/src/math/generic/atanf.cpp @@ -52,12 +52,12 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) { return x; // x <= 2^-12; if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) { -#if defined(LIBC_TARGET_CPU_HAS_FMA) +#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) return fputil::multiply_add(x, -0x1.0p-25f, x); #else double x_d = static_cast(x); return static_cast(fputil::multiply_add(x_d, -0x1.0p-25, x_d)); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT } // Use Taylor polynomial: // atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11). diff --git a/libc/src/math/generic/cbrt.cpp b/libc/src/math/generic/cbrt.cpp index ee7d69b2c211f..ce227e6650c84 100644 --- a/libc/src/math/generic/cbrt.cpp +++ b/libc/src/math/generic/cbrt.cpp @@ -58,7 +58,7 @@ double intial_approximation(double x) { // Get the error term for Newton iteration: // h(x) = x^3 * a^2 - 1, -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) { return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) + fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo); diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp index 568b1254c6f02..b60082bf9c308 100644 --- a/libc/src/math/generic/cos.cpp +++ b/libc/src/math/generic/cos.cpp @@ -20,11 +20,11 @@ #include "src/math/generic/range_reduction_double_common.h" #include "src/math/generic/sincos_eval.h" -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE #include "range_reduction_double_fma.h" #else #include "range_reduction_double_nofma.h" 
-#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp index 972ffa923aedf..23e3db067e669 100644 --- a/libc/src/math/generic/cosf.cpp +++ b/libc/src/math/generic/cosf.cpp @@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) { // |x| < 2^-125. For targets without FMA instructions, we simply use // double for intermediate results as it is more efficient than using an // emulated version of FMA. -#if defined(LIBC_TARGET_CPU_HAS_FMA) +#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f); #else return static_cast(fputil::multiply_add(xd, -0x1.0p-25, 1.0)); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT } if (auto r = COSF_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value())) diff --git a/libc/src/math/generic/cospif.cpp b/libc/src/math/generic/cospif.cpp index 4ef1539539921..29566f4fceacf 100644 --- a/libc/src/math/generic/cospif.cpp +++ b/libc/src/math/generic/cospif.cpp @@ -50,11 +50,11 @@ LLVM_LIBC_FUNCTION(float, cospif, (float x)) { // The exhautive test passes for smaller values if (LIBC_UNLIKELY(x_abs < 0x38A2'F984U)) { -#if defined(LIBC_TARGET_CPU_HAS_FMA) +#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f); #else return static_cast(fputil::multiply_add(xd, -0x1.0p-25, 1.0)); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT } // Numbers greater or equal to 2^23 are always integers or NaN diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp index 006dd5c554428..f2002e9f146c0 100644 --- a/libc/src/math/generic/exp10f16.cpp +++ b/libc/src/math/generic/exp10f16.cpp @@ -26,7 +26,7 @@ namespace LIBC_NAMESPACE_DECL { -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT static constexpr size_t N_EXP10F16_EXCEPTS = 5; 
#else static constexpr size_t N_EXP10F16_EXCEPTS = 8; @@ -44,7 +44,7 @@ static constexpr fputil::ExceptValues {0xbf0aU, 0x2473U, 1U, 0U, 0U}, // x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ) {0xc387U, 0x09a5U, 1U, 0U, 0U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ) {0x4030U, 0x57c1U, 1U, 0U, 1U}, // x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ) diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp index 449aedf254ca5..41e2c2bb14b04 100644 --- a/libc/src/math/generic/exp10m1f16.cpp +++ b/libc/src/math/generic/exp10m1f16.cpp @@ -34,7 +34,7 @@ static constexpr fputil::ExceptValues EXP10M1F16_EXCEPTS_LO = {{ {0x9788U, 0x9c53U, 0U, 1U, 0U}, }}; -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3; #else static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6; @@ -49,7 +49,7 @@ static constexpr fputil::ExceptValues {0x3657U, 0x3df6U, 1U, 0U, 0U}, // x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ) {0x3741U, 0x3f5cU, 1U, 0U, 1U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ) {0x4030U, 0x57b1U, 1U, 0U, 1U}, // x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ) diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp index 2c612777c9cb5..726f88b6457fc 100644 --- a/libc/src/math/generic/exp2.cpp +++ b/libc/src/math/generic/exp2.cpp @@ -35,11 +35,11 @@ using LIBC_NAMESPACE::operator""_u128; // Error bounds: // Errors when using double precision. -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE constexpr double ERR_D = 0x1.0p-63; #else constexpr double ERR_D = 0x1.8p-63; -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE #ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS // Errors when using double-double precision. 
diff --git a/libc/src/math/generic/exp2m1f16.cpp b/libc/src/math/generic/exp2m1f16.cpp index 6a1cd2328a050..eceb76f1893e2 100644 --- a/libc/src/math/generic/exp2m1f16.cpp +++ b/libc/src/math/generic/exp2m1f16.cpp @@ -40,7 +40,7 @@ static constexpr fputil::ExceptValues EXP2M1F16_EXCEPTS_LO = {{ {0x973fU, 0x9505U, 0U, 1U, 0U}, }}; -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6; #else static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7; @@ -51,13 +51,13 @@ static constexpr fputil::ExceptValues // (input, RZ output, RU offset, RD offset, RN offset) // x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ) {0x3396U, 0x31b7U, 1U, 0U, 0U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ) {0x34baU, 0x3345U, 1U, 0U, 0U}, #endif // x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ) {0x36b6U, 0x3566U, 1U, 0U, 0U}, -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ) {0x37b7U, 0x3659U, 1U, 0U, 1U}, #endif @@ -67,7 +67,7 @@ static constexpr fputil::ExceptValues {0xb3ccU, 0xb0f9U, 0U, 1U, 0U}, // x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ) {0xb8a5U, 0xb54cU, 0U, 1U, 1U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ) {0xba8dU, 0xb6edU, 0U, 1U, 1U}, #endif diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp index d5e9e85ed4bd3..1e44e943d9258 100644 --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -38,14 +38,14 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { return 0x1.8dbe62p-3f; } -#if !defined(LIBC_TARGET_CPU_HAS_FMA) +#if !defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE) if (LIBC_UNLIKELY(x_u == 0xbdc1'c6cbU)) { // x = -0x1.838d96p-4f int round_mode = fputil::quick_get_round(); if (round_mode == FE_TONEAREST || round_mode == FE_DOWNWARD) return 
-0x1.71c884p-4f; return -0x1.71c882p-4f; } -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE // When |x| > 25*log(2), or nan if (LIBC_UNLIKELY(x_abs >= 0x418a'a123U)) { @@ -102,12 +102,12 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { // 2^-76. For targets without FMA instructions, we simply use double for // intermediate results as it is more efficient than using an emulated // version of FMA. -#if defined(LIBC_TARGET_CPU_HAS_FMA) - return fputil::fma(x, x, x); +#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) + return fputil::multiply_add(x, x, x); #else double xd = x; return static_cast(fputil::multiply_add(xd, xd, xd)); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT } constexpr double COEFFS[] = {0x1p-1, diff --git a/libc/src/math/generic/expm1f16.cpp b/libc/src/math/generic/expm1f16.cpp index 4ce0efd1f461b..bfd263eaa9cb0 100644 --- a/libc/src/math/generic/expm1f16.cpp +++ b/libc/src/math/generic/expm1f16.cpp @@ -29,7 +29,7 @@ static constexpr fputil::ExceptValues EXPM1F16_EXCEPTS_LO = {{ {0x2959U, 0x2975U, 1U, 0U, 1U}, }}; -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 2; #else static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 3; @@ -42,7 +42,7 @@ static constexpr fputil::ExceptValues {0x3f0dU, 0x44d3U, 1U, 0U, 1U}, // x = -0x1.e28p-3, expm1f16(x) = -0x1.adcp-3 (RZ) {0xb38aU, 0xb2b7U, 0U, 1U, 1U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.a08p-3, exp10m1f(x) = 0x1.cdcp-3 (RZ) {0x3282U, 0x3337U, 1U, 0U, 0U}, #endif diff --git a/libc/src/math/generic/fmul.cpp b/libc/src/math/generic/fmul.cpp index e759e48cd6989..daad64873f27a 100644 --- a/libc/src/math/generic/fmul.cpp +++ b/libc/src/math/generic/fmul.cpp @@ -21,7 +21,7 @@ LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) { // correctly rounded for all rounding modes, so we fall // back to the generic `fmul` implementation -#ifndef 
LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE return fputil::generic::mul(x, y); #else fputil::DoubleDouble prod = fputil::exact_mult(x, y); diff --git a/libc/src/math/generic/hypotf.cpp b/libc/src/math/generic/hypotf.cpp index 959c0420ae149..ec48f62163a48 100644 --- a/libc/src/math/generic/hypotf.cpp +++ b/libc/src/math/generic/hypotf.cpp @@ -55,7 +55,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) { // These squares are exact. double a_sq = ad * ad; -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE double sum_sq = fputil::multiply_add(bd, bd, a_sq); #else double b_sq = bd * bd; @@ -72,7 +72,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) { double r_d = result.get_val(); // Perform rounding correction. -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE double sum_sq_lo = fputil::multiply_add(bd, bd, a_sq - sum_sq); double err = sum_sq_lo - fputil::multiply_add(r_d, r_d, -sum_sq); #else diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp index 04eebab975cd5..0cd4424ee0baf 100644 --- a/libc/src/math/generic/log.cpp +++ b/libc/src/math/generic/log.cpp @@ -800,13 +800,13 @@ LLVM_LIBC_FUNCTION(double, log, (double x)) { fputil::DoubleDouble r1; // Perform exact range reduction -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE u = fputil::multiply_add(r, m, -1.0); // exact #else uint64_t c_m = x_m & 0x3FFF'E000'0000'0000ULL; double c = FPBits_t(c_m).get_val(); u = fputil::multiply_add(r, m - c, CD[index]); // exact -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE // Exact sum: // r1.hi + r1.lo = e_x * log(2)_hi - log(r)_hi + u diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp index fd8d5a8aae938..1c4e559ba083c 100644 --- a/libc/src/math/generic/log10.cpp +++ b/libc/src/math/generic/log10.cpp @@ -802,13 +802,13 @@ LLVM_LIBC_FUNCTION(double, log10, (double x)) { fputil::DoubleDouble r1; // 
Perform exact range reduction -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE u = fputil::multiply_add(r, m, -1.0); // exact #else uint64_t c_m = x_m & 0x3FFF'E000'0000'0000ULL; double c = FPBits_t(c_m).get_val(); u = fputil::multiply_add(r, m - c, CD[index]); // exact -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE // Error of u_sq = ulp(u^2); u_sq = u * u; diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp index c635fa4ef9b63..73ca26374e4a3 100644 --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -145,7 +145,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) { return fputil::round_result_slightly_up(-0x1.dd2c6ep-5f); case 0x3f80'70d8U: // x = 0x1.00e1bp0f return fputil::round_result_slightly_up(0x1.8762c4p-10f); -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE case 0x08ae'a356U: // x = 0x1.5d46acp-110f return fputil::round_result_slightly_up(-0x1.07d3b4p+5f); case 0x120b'93dcU: // x = 0x1.1727b8p-91f @@ -156,7 +156,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) { return fputil::round_result_slightly_down(0x1.2c9314p+3f); case 0x7956'ba5eU: // x = 69683218960000541503257137270226944.0 return fputil::round_result_slightly_up(0x1.16bebap+5f); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE } } @@ -194,12 +194,12 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) { float u = xbits.get_val(); double v; -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT v = static_cast(fputil::multiply_add(u, R[index], -1.0f)); // Exact. 
#else v = fputil::multiply_add(static_cast(u), static_cast(R[index]), -1.0); // Exact -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT // Degree-5 polynomial approximation of log10 generated by: // > P = fpminimax(log10(1 + x)/x, 4, [|D...|], [-2^-8, 2^-7]); diff --git a/libc/src/math/generic/log10f16.cpp b/libc/src/math/generic/log10f16.cpp index 990bcabaf6871..c7cb99e1d4691 100644 --- a/libc/src/math/generic/log10f16.cpp +++ b/libc/src/math/generic/log10f16.cpp @@ -23,7 +23,7 @@ namespace LIBC_NAMESPACE_DECL { -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT static constexpr size_t N_LOG10F16_EXCEPTS = 11; #else static constexpr size_t N_LOG10F16_EXCEPTS = 17; @@ -36,7 +36,7 @@ static constexpr fputil::ExceptValues {0x338fU, 0xb903U, 0U, 1U, 0U}, // x = 0x1.fep-3, log10f16(x) = -0x1.35p-1 (RZ) {0x33f8U, 0xb8d4U, 0U, 1U, 1U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.394p-1, log10f16(x) = -0x1.b4cp-3 (RZ) {0x38e5U, 0xb2d3U, 0U, 1U, 1U}, #endif @@ -47,7 +47,7 @@ static constexpr fputil::ExceptValues // x = 0x1.f3p-1, log10f16(x) = -0x1.6dcp-7 (RZ) {0x3bccU, 0xa1b7U, 0U, 1U, 1U}, // x = 0x1.f38p-1, log10f16(x) = -0x1.5f8p-7 (RZ) -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT {0x3bceU, 0xa17eU, 0U, 1U, 1U}, // x = 0x1.fd8p-1, log10f16(x) = -0x1.168p-9 (RZ) {0x3bf6U, 0x985aU, 0U, 1U, 1U}, diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp index b1f02164b6a28..058409fed081d 100644 --- a/libc/src/math/generic/log1p.cpp +++ b/libc/src/math/generic/log1p.cpp @@ -1009,7 +1009,7 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) { fputil::DoubleDouble v_lo = fputil::exact_mult(m_dd.lo, r); // Perform exact range reduction -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE v_hi = fputil::multiply_add(r, m_dd.hi, -1.0); // Exact. #else // c = 1 + idx * 2^-7. 
@@ -1017,7 +1017,7 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) { uint64_t(0x3FF0'0000'0000'0000ULL)) .get_val(); v_hi = fputil::multiply_add(r, m_dd.hi - c, RCM1[idx]); // Exact -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE // Range reduction output: // -0x1.69000000000edp-8 < v_hi + v_lo < 0x1.7f00000000081p-8 diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index 869cb077cc434..442b00144104b 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -108,7 +108,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); return FPBits::inf(Sign::NEG).get_val(); -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE case 0x4cc1c80bU: // x = 0x1.839016p+26f return fputil::round_result_slightly_down(0x1.26fc04p+4f); case 0x5ee8984eU: // x = 0x1.d1309cp+62f @@ -117,7 +117,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { return fputil::round_result_slightly_up(0x1.af66cp+5f); case 0x79e7ec37U: // x = 0x1.cfd86ep+116f return fputil::round_result_slightly_up(0x1.43ff6ep+6f); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE } return internal::log(xd + 1.0); diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp index f46ff724a4f37..27ca2fc350f17 100644 --- a/libc/src/math/generic/log2.cpp +++ b/libc/src/math/generic/log2.cpp @@ -915,13 +915,13 @@ LLVM_LIBC_FUNCTION(double, log2, (double x)) { fputil::DoubleDouble r1; // Perform exact range reduction -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE u = fputil::multiply_add(r, m, -1.0); // exact #else uint64_t c_m = x_m & 0x3FFF'E000'0000'0000ULL; double c = FPBits_t(c_m).get_val(); u = fputil::multiply_add(r, m - c, CD[index]); // exact -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE // Exact sum: // r1.hi + r1.lo = e_x * 
log(2)_hi - log(r)_hi + u diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp index 111f3f130bcab..b25ec41f277b6 100644 --- a/libc/src/math/generic/log2f.cpp +++ b/libc/src/math/generic/log2f.cpp @@ -97,11 +97,11 @@ LLVM_LIBC_FUNCTION(float, log2f, (float x)) { float u = xbits.get_val(); double v; -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT v = static_cast(fputil::multiply_add(u, R[index], -1.0f)); // Exact. #else v = fputil::multiply_add(static_cast(u), RD[index], -1.0); // Exact -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT double extra_factor = static_cast(m) + LOG2_R[index]; diff --git a/libc/src/math/generic/log2f16.cpp b/libc/src/math/generic/log2f16.cpp index ff4e0268b53d0..70d592c1976d7 100644 --- a/libc/src/math/generic/log2f16.cpp +++ b/libc/src/math/generic/log2f16.cpp @@ -23,7 +23,7 @@ namespace LIBC_NAMESPACE_DECL { -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT static constexpr size_t N_LOG2F16_EXCEPTS = 2; #else static constexpr size_t N_LOG2F16_EXCEPTS = 9; @@ -32,7 +32,7 @@ static constexpr size_t N_LOG2F16_EXCEPTS = 9; static constexpr fputil::ExceptValues LOG2F16_EXCEPTS = {{ // (input, RZ output, RU offset, RD offset, RN offset) -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.224p-1, log2f16(x) = -0x1.a34p-1 (RZ) {0x3889U, 0xba8dU, 0U, 1U, 0U}, // x = 0x1.e34p-1, log2f16(x) = -0x1.558p-4 (RZ) @@ -40,7 +40,7 @@ static constexpr fputil::ExceptValues #endif // x = 0x1.e8cp-1, log2f16(x) = -0x1.128p-4 (RZ) {0x3ba3U, 0xac4aU, 0U, 1U, 0U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.f98p-1, log2f16(x) = -0x1.2ep-6 (RZ) {0x3be6U, 0xa4b8U, 0U, 1U, 0U}, // x = 0x1.facp-1, log2f16(x) = -0x1.e7p-7 (RZ) @@ -48,7 +48,7 @@ static constexpr fputil::ExceptValues #endif // x = 0x1.fb4p-1, log2f16(x) = -0x1.b88p-7 (RZ) {0x3bedU, 0xa2e2U, 0U, 1U, 1U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA 
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.fecp-1, log2f16(x) = -0x1.cep-9 (RZ) {0x3bfbU, 0x9b38U, 0U, 1U, 1U}, // x = 0x1.ffcp-1, log2f16(x) = -0x1.714p-11 (RZ) diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp index 30c00edafe21d..9ed44cdc04226 100644 --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -104,14 +104,14 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { return round_result_slightly_down(0x1.08b512p+6f); case 0x7a17f30aU: // x = 0x1.2fe614p+117f return round_result_slightly_up(0x1.451436p+6f); -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE case 0x500ffb03U: // x = 0x1.1ff606p+33f return round_result_slightly_up(0x1.6fdd34p+4f); case 0x5cd69e88U: // x = 0x1.ad3d1p+58f return round_result_slightly_up(0x1.45c146p+5f); case 0x5ee8984eU: // x = 0x1.d1309cp+62f; return round_result_slightly_up(0x1.5c9442p+5f); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE } // Exceptional inputs. if (LIBC_UNLIKELY(x_u > FPBits::max_normal().uintval())) { @@ -152,11 +152,11 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { float u = xbits.get_val(); double v; -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT v = static_cast(fputil::multiply_add(u, R[index], -1.0f)); // Exact. 
#else v = fputil::multiply_add(static_cast(u), RD[index], -1.0); // Exact -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT // Degree-5 polynomial approximation of log generated by Sollya with: // > P = fpminimax(log(1 + x)/x, 4, [|1, D...|], [-2^-8, 2^-7]); diff --git a/libc/src/math/generic/logf16.cpp b/libc/src/math/generic/logf16.cpp index 802225a810550..dd08e34270eef 100644 --- a/libc/src/math/generic/logf16.cpp +++ b/libc/src/math/generic/logf16.cpp @@ -23,7 +23,7 @@ namespace LIBC_NAMESPACE_DECL { -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT static constexpr size_t N_LOGF16_EXCEPTS = 5; #else static constexpr size_t N_LOGF16_EXCEPTS = 11; @@ -32,7 +32,7 @@ static constexpr size_t N_LOGF16_EXCEPTS = 11; static constexpr fputil::ExceptValues LOGF16_EXCEPTS = {{ // (input, RZ output, RU offset, RD offset, RN offset) -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.61cp-13, logf16(x) = -0x1.16p+3 (RZ) {0x0987U, 0xc858U, 0U, 1U, 0U}, // x = 0x1.f2p-12, logf16(x) = -0x1.e98p+2 (RZ) @@ -42,7 +42,7 @@ static constexpr fputil::ExceptValues {0x1935U, 0xc5f9U, 0U, 1U, 0U}, // x = 0x1.5ep-8, logf16(x) = -0x1.4ecp+2 (RZ) {0x1d78U, 0xc53bU, 0U, 1U, 0U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.fdp-1, logf16(x) = -0x1.81p-8 (RZ) {0x3bf4U, 0x9e04U, 0U, 1U, 1U}, // x = 0x1.fep-1, logf16(x) = -0x1.008p-8 (RZ) @@ -52,7 +52,7 @@ static constexpr fputil::ExceptValues {0x3bfcU, 0x9801U, 0U, 1U, 0U}, // x = 0x1.ff8p-1, logf16(x) = -0x1p-10 (RZ) {0x3bfeU, 0x9400U, 0U, 1U, 1U}, -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT // x = 0x1.4c4p+1, logf16(x) = 0x1.e84p-1 (RZ) {0x4131U, 0x3ba1U, 1U, 0U, 1U}, #else diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp index a2a0bb698f81a..8a12934f6c4ba 100644 --- a/libc/src/math/generic/pow.cpp +++ b/libc/src/math/generic/pow.cpp @@ -394,14 +394,14 @@ 
LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) { DoubleDouble dx_c0; // Perform exact range reduction and exact product dx * c0. -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE dx = fputil::multiply_add(RD[idx_x], m_x.get_val(), -1.0); // Exact dx_c0 = fputil::exact_mult(COEFFS[0], dx); #else double c = FPBits(m_x.uintval() & 0x3fff'e000'0000'0000).get_val(); dx = fputil::multiply_add(RD[idx_x], m_x.get_val() - c, CD[idx_x]); // Exact dx_c0 = fputil::exact_mult(dx, COEFFS[0]); // Exact -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE double dx2 = dx * dx; double c0 = fputil::multiply_add(dx, COEFFS[2], COEFFS[1]); diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index 7f4417d275702..2d7deca3c77bb 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -165,11 +165,11 @@ alignas(16) constexpr DoubleDouble LOG2_R_DD[128] = { }; #else -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE constexpr uint64_t ERR = 64; #else constexpr uint64_t ERR = 128; -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE // We choose the precision of the high part to be 53 - 24 - 8, so that when // y * (e_x + LOG2_R_DD[i].hi) is exact. @@ -851,11 +851,11 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { // log2(m_x) = log2( (1 + dx) / r ) // = log2(1 + dx) - log2(r). 
double dx; -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT dx = static_cast(fputil::multiply_add(m_x, R[idx_x], -1.0f)); // Exact #else dx = fputil::multiply_add(static_cast(m_x), RD[idx_x], -1.0); // Exact -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT // Degree-5 polynomial approximation: // dx * P(dx) ~ log2(1 + dx) diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h index 711a12219c847..f3dcdb937333c 100644 --- a/libc/src/math/generic/range_reduction_double_common.h +++ b/libc/src/math/generic/range_reduction_double_common.h @@ -20,14 +20,14 @@ namespace LIBC_NAMESPACE_DECL { -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE static constexpr unsigned SPLIT = fputil::DefaultSplit::VALUE; #else // When there is no-FMA instructions, in order to have exact product of 2 double // precision with directional roundings, we need to lower the precision of the // constants by at least 1 bit, and use a different splitting constant. 
static constexpr unsigned SPLIT = 28; -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE using LIBC_NAMESPACE::fputil::DoubleDouble; using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>; diff --git a/libc/src/math/generic/sin.cpp b/libc/src/math/generic/sin.cpp index b32486dff487c..4a58dcf4b173f 100644 --- a/libc/src/math/generic/sin.cpp +++ b/libc/src/math/generic/sin.cpp @@ -21,11 +21,11 @@ #include "src/math/generic/range_reduction_double_common.h" #include "src/math/generic/sincos_eval.h" -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE #include "range_reduction_double_fma.h" #else #include "range_reduction_double_nofma.h" -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE namespace LIBC_NAMESPACE_DECL { @@ -52,7 +52,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { if (LIBC_UNLIKELY(x == 0.0)) return x + x; // Make sure it works with FTZ/DAZ. -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE return fputil::multiply_add(x, -0x1.0p-54, x); #else if (LIBC_UNLIKELY(x_e < 4)) { @@ -63,7 +63,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { return FPBits(xbits.uintval() - 1).get_val(); } return fputil::multiply_add(x, -0x1.0p-54, x); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE } // No range reduction needed. 
k = 0; diff --git a/libc/src/math/generic/sincos.cpp b/libc/src/math/generic/sincos.cpp index 166ce46603140..0ac2f7f997527 100644 --- a/libc/src/math/generic/sincos.cpp +++ b/libc/src/math/generic/sincos.cpp @@ -22,11 +22,11 @@ #include "src/math/generic/range_reduction_double_common.h" #include "src/math/generic/sincos_eval.h" -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE #include "range_reduction_double_fma.h" #else #include "range_reduction_double_nofma.h" -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE namespace LIBC_NAMESPACE_DECL { @@ -57,7 +57,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { } // For |x| < 2^-27, max(|sin(x) - x|, |cos(x) - 1|) < ulp(x)/2. -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); *cos_x = fputil::multiply_add(x, -x, 1.0); #else @@ -71,7 +71,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { *sin_x = FPBits(xbits.uintval() - 1).get_val(); } *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE return; } // No range reduction needed. 
diff --git a/libc/src/math/generic/sincos_eval.h b/libc/src/math/generic/sincos_eval.h index 6cd1da4721bf5..41a4c75849ff4 100644 --- a/libc/src/math/generic/sincos_eval.h +++ b/libc/src/math/generic/sincos_eval.h @@ -65,7 +65,7 @@ LIBC_INLINE double sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, double u_hi_neg_half = (-0.5) * u.hi; DoubleDouble v; -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE v.hi = fputil::multiply_add(u.hi, u_hi_neg_half, 1.0); v.lo = 1.0 - v.hi; // Exact v.lo = fputil::multiply_add(u.hi, u_hi_neg_half, v.lo); @@ -73,7 +73,7 @@ LIBC_INLINE double sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, DoubleDouble u_hi_sq_neg_half = fputil::exact_mult(u.hi, u_hi_neg_half); v = fputil::exact_add(1.0, u_hi_sq_neg_half.hi); v.lo += u_hi_sq_neg_half.lo; -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE // r1 ~ -1/720 + u_hi^2 / 40320 double r1 = fputil::multiply_add(u_hi_sq, 0x1.a01a01a01a01ap-16, diff --git a/libc/src/math/generic/sincosf.cpp b/libc/src/math/generic/sincosf.cpp index ccaa29c10c4c6..898c8bd0f0ae9 100644 --- a/libc/src/math/generic/sincosf.cpp +++ b/libc/src/math/generic/sincosf.cpp @@ -130,14 +130,14 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) { // |x| < 2^-125. For targets without FMA instructions, we simply use // double for intermediate results as it is more efficient than using an // emulated version of FMA. 
-#if defined(LIBC_TARGET_CPU_HAS_FMA) +#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) *sinp = fputil::multiply_add(x, -0x1.0p-25f, x); *cosp = fputil::multiply_add(FPBits(x_abs).get_val(), -0x1.0p-25f, 1.0f); #else *sinp = static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, xd)); *cosp = static_cast<float>(fputil::multiply_add( static_cast<double>(FPBits(x_abs).get_val()), -0x1.0p-25, 1.0)); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT return; } diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h index 133896b5de7a3..05cab09d2089b 100644 --- a/libc/src/math/generic/sincosf16_utils.h +++ b/libc/src/math/generic/sincosf16_utils.h @@ -40,7 +40,7 @@ constexpr float SIN_K_PI_OVER_32[64] = { LIBC_INLINE int32_t range_reduction_sincospif16(float x, float &y) { float kf = fputil::nearest_integer(x * 32); - y = fputil::multiply_add(x, 32.0, -kf); + y = fputil::multiply_add(x, 32.0f, -kf); return static_cast<int32_t>(kf); } diff --git a/libc/src/math/generic/sincosf_utils.h b/libc/src/math/generic/sincosf_utils.h index 726a5ab9b64be..6eaf820e5c1b0 100644 --- a/libc/src/math/generic/sincosf_utils.h +++ b/libc/src/math/generic/sincosf_utils.h @@ -14,7 +14,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA -#if defined(LIBC_TARGET_CPU_HAS_FMA) +#if defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE) #include "range_reduction_fma.h" // using namespace LIBC_NAMESPACE::fma; using LIBC_NAMESPACE::fma::FAST_PASS_BOUND; @@ -27,7 +27,7 @@ using LIBC_NAMESPACE::fma::small_range_reduction; using LIBC_NAMESPACE::generic::FAST_PASS_BOUND; using LIBC_NAMESPACE::generic::large_range_reduction; using LIBC_NAMESPACE::generic::small_range_reduction; -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE namespace LIBC_NAMESPACE_DECL { @@ -108,7 +108,7 @@ LIBC_INLINE void sincosf_eval(double xd, uint32_t x_abs, double &sin_k, // => pi * x = (k + y) * pi / 32 
static LIBC_INLINE int64_t range_reduction_sincospi(double x, double &y) { double kd = fputil::nearest_integer(x * 32); - y = fputil::multiply_add(x, 32.0, -kd); + y = fputil::multiply_add(x, 32.0, -kd); return static_cast<int64_t>(kd); } diff --git a/libc/src/math/generic/sinf.cpp b/libc/src/math/generic/sinf.cpp index cea267d4c683e..da188e5df557e 100644 --- a/libc/src/math/generic/sinf.cpp +++ b/libc/src/math/generic/sinf.cpp @@ -19,7 +19,7 @@ #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA -#if defined(LIBC_TARGET_CPU_HAS_FMA) +#if defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE) #include "range_reduction_fma.h" #else #include "range_reduction.h" @@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) { // |x| < 2^-125. For targets without FMA instructions, we simply use // double for intermediate results as it is more efficient than using an // emulated version of FMA. -#if defined(LIBC_TARGET_CPU_HAS_FMA) +#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) return fputil::multiply_add(x, -0x1.0p-25f, x); #else return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, xd)); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT } // |x| < pi/16. 
diff --git a/libc/src/math/generic/tan.cpp b/libc/src/math/generic/tan.cpp index 19d31a8441efb..a899a2128d384 100644 --- a/libc/src/math/generic/tan.cpp +++ b/libc/src/math/generic/tan.cpp @@ -22,11 +22,11 @@ #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA #include "src/math/generic/range_reduction_double_common.h" -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE #include "range_reduction_double_fma.h" #else #include "range_reduction_double_nofma.h" -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE namespace LIBC_NAMESPACE_DECL { @@ -140,7 +140,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { if (LIBC_UNLIKELY(x == 0.0)) return x + x; // Make sure it works with FTZ/DAZ. -#ifdef LIBC_TARGET_CPU_HAS_FMA +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE return fputil::multiply_add(x, 0x1.0p-54, x); #else if (LIBC_UNLIKELY(x_e < 4)) { @@ -150,7 +150,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { return FPBits(xbits.uintval() + 1).get_val(); } return fputil::multiply_add(x, 0x1.0p-54, x); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE } // No range reduction needed. k = 0; diff --git a/libc/src/math/generic/tanf.cpp b/libc/src/math/generic/tanf.cpp index 6fd5f9a103676..f4f7e08838d81 100644 --- a/libc/src/math/generic/tanf.cpp +++ b/libc/src/math/generic/tanf.cpp @@ -74,11 +74,11 @@ LLVM_LIBC_FUNCTION(float, tanf, (float x)) { // |x| < 2^-125. For targets without FMA instructions, we simply use // double for intermediate results as it is more efficient than using an // emulated version of FMA. -#if defined(LIBC_TARGET_CPU_HAS_FMA) +#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT) return fputil::multiply_add(x, 0x1.0p-25f, x); #else return static_cast<float>(fputil::multiply_add(xd, 0x1.0p-25, xd)); -#endif // LIBC_TARGET_CPU_HAS_FMA +#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT } // |x| < pi/32