llvm · lntue · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025
diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h
@@ -24,21 +24,26 @@ LIBC_INLINE OutType fma(InType x, InType y, InType z) {
 }
 
 #ifdef LIBC_TARGET_CPU_HAS_FMA
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 template <> LIBC_INLINE float fma(float x, float y, float z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
 #else
   return __builtin_fmaf(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
 
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 template <> LIBC_INLINE double fma(double x, double y, double z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
 #else
   return __builtin_fma(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 #endif // LIBC_TARGET_CPU_HAS_FMA
 
 } // namespace fputil

diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
@@ -100,6 +100,26 @@ LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) {
   return r;
 }
 
+// The templated exact multiplication needs template version of
+// LIBC_TARGET_CPU_HAS_FMA_* macro to correctly select the implementation.
+// These can be moved to "src/__support/macros/properties/cpu_features.h" if
+// other part of libc needed.
+template <typename T> struct TargetHasFmaInstruction {
+  static constexpr bool VALUE = false;
+};
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
+template <> struct TargetHasFmaInstruction<float> {
+  static constexpr bool VALUE = true;
+};
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+template <> struct TargetHasFmaInstruction<double> {
+  static constexpr bool VALUE = true;
+};
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+
 // Note: When FMA instruction is not available, the `exact_mult` function is
 // only correct for round-to-nearest mode.  See:
 //   Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
@@ -111,15 +131,15 @@ template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
 LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
   NumberPair<T> r{0.0, 0.0};
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
-  r.hi = a * b;
-  r.lo = fputil::multiply_add(a, b, -r.hi);
-#else
-  // Dekker's Product.
-  NumberPair<T> as = split(a);
+  if constexpr (TargetHasFmaInstruction<T>::VALUE) {
+    r.hi = a * b;
+    r.lo = fputil::multiply_add(a, b, -r.hi);
+  } else {
+    // Dekker's Product.
+    NumberPair<T> as = split(a);
 
-  r = exact_mult<T, SPLIT_B>(as, a, b);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+    r = exact_mult<T, SPLIT_B>(as, a, b);
+  }
 
   return r;
 }

diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h
@@ -46,21 +46,25 @@ multiply_add(T x, T y, T z) {
 namespace LIBC_NAMESPACE_DECL {
 namespace fputil {
 
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 LIBC_INLINE float multiply_add(float x, float y, float z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
 #else
   return __builtin_fmaf(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
 
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 LIBC_INLINE double multiply_add(double x, double y, double z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
 #else
   return __builtin_fma(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
 } // namespace fputil
 } // namespace LIBC_NAMESPACE_DECL

diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
@@ -45,6 +45,21 @@
 #if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) ||   \
     defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA)
 #define LIBC_TARGET_CPU_HAS_FMA
+// Provide a more fine-grained control of FMA instruction for ARM targets.
+#if defined(__ARM_FP)
+#if (__ARM_FP & 0x2)
+#define LIBC_TARGET_CPU_HAS_FMA_HALF
+#endif // LIBC_TARGET_CPU_HAS_FMA_HALF
+#if (__ARM_FP & 0x4)
+#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#if (__ARM_FP & 0x8)
+#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#else
+#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#endif
 #endif
 
 #if defined(LIBC_TARGET_ARCH_IS_AARCH64) ||                                    \

diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp
@@ -74,12 +74,12 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) {
       // |x| < 2^-125. For targets without FMA instructions, we simply use
       // double for intermediate results as it is more efficient than using an
       // emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
       return fputil::multiply_add(x, 0x1.0p-25f, x);
 #else
       double xd = static_cast<double>(x);
       return static_cast<float>(fputil::multiply_add(xd, 0x1.0p-25, xd));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
     }
 
     // Check for exceptional values

diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp
@@ -131,7 +131,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
     num_r = num_d;
     den_r = den_d;
   }
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
   q.lo = fputil::multiply_add(q.hi, -den_r, num_r) / den_r;
 #else
   // Compute `(num_r - q.hi * den_r) / den_r` accurately without FMA
@@ -140,7 +140,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
   double t1 = fputil::multiply_add(q_hi_dd.hi, -den_r, num_r); // Exact
   double t2 = fputil::multiply_add(q_hi_dd.lo, -den_r, t1);
   q.lo = t2 / den_r;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
   // Taylor polynomial, evaluating using Horner's scheme:
   //   P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9 - x^11/11 + x^13/13 - x^15/15

diff --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp
@@ -52,12 +52,12 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
       return x;
     // x <= 2^-12;
     if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
       return fputil::multiply_add(x, -0x1.0p-25f, x);
 #else
       double x_d = static_cast<double>(x);
       return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
     }
     // Use Taylor polynomial:
     //   atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).

diff --git a/libc/src/math/generic/cbrt.cpp b/libc/src/math/generic/cbrt.cpp
@@ -58,7 +58,7 @@ double intial_approximation(double x) {
 
 // Get the error term for Newton iteration:
 //   h(x) = x^3 * a^2 - 1,
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
   return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
          fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);

diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp
@@ -20,11 +20,11 @@
 #include "src/math/generic/range_reduction_double_common.h"
 #include "src/math/generic/sincos_eval.h"
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 #include "range_reduction_double_fma.h"
 #else
 #include "range_reduction_double_nofma.h"
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
 namespace LIBC_NAMESPACE_DECL {
 

diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp
@@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
     // |x| < 2^-125. For targets without FMA instructions, we simply use
     // double for intermediate results as it is more efficient than using an
     // emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
     return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
 #else
     return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
   }
 
   if (auto r = COSF_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))

diff --git a/libc/src/math/generic/cospif.cpp b/libc/src/math/generic/cospif.cpp
@@ -50,11 +50,11 @@ LLVM_LIBC_FUNCTION(float, cospif, (float x)) {
   // The exhautive test passes for smaller values
   if (LIBC_UNLIKELY(x_abs < 0x38A2'F984U)) {
 
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
     return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
 #else
     return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
   }
 
   // Numbers greater or equal to 2^23 are always integers or NaN

diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp
@@ -26,7 +26,7 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 static constexpr size_t N_EXP10F16_EXCEPTS = 5;
 #else
 static constexpr size_t N_EXP10F16_EXCEPTS = 8;
@@ -44,7 +44,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
         {0xbf0aU, 0x2473U, 1U, 0U, 0U},
         // x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ)
         {0xc387U, 0x09a5U, 1U, 0U, 0U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ)
         {0x4030U, 0x57c1U, 1U, 0U, 1U},
         // x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ)

diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp
@@ -34,7 +34,7 @@ static constexpr fputil::ExceptValues<float16, 3> EXP10M1F16_EXCEPTS_LO = {{
     {0x9788U, 0x9c53U, 0U, 1U, 0U},
 }};
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3;
 #else
 static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6;
@@ -49,7 +49,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10M1F16_EXCEPTS_HI>
         {0x3657U, 0x3df6U, 1U, 0U, 0U},
         // x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ)
         {0x3741U, 0x3f5cU, 1U, 0U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ)
         {0x4030U, 0x57b1U, 1U, 0U, 1U},
         // x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ)

diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp
@@ -35,11 +35,11 @@ using LIBC_NAMESPACE::operator""_u128;
 
 // Error bounds:
 // Errors when using double precision.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 constexpr double ERR_D = 0x1.0p-63;
 #else
 constexpr double ERR_D = 0x1.8p-63;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
 #ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 // Errors when using double-double precision.

diff --git a/libc/src/math/generic/exp2m1f16.cpp b/libc/src/math/generic/exp2m1f16.cpp
@@ -40,7 +40,7 @@ static constexpr fputil::ExceptValues<float16, 6> EXP2M1F16_EXCEPTS_LO = {{
     {0x973fU, 0x9505U, 0U, 1U, 0U},
 }};
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6;
 #else
 static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7;
@@ -51,13 +51,13 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
         // (input, RZ output, RU offset, RD offset, RN offset)
         // x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ)
         {0x3396U, 0x31b7U, 1U, 0U, 0U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ)
         {0x34baU, 0x3345U, 1U, 0U, 0U},
 #endif
         // x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ)
         {0x36b6U, 0x3566U, 1U, 0U, 0U},
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ)
         {0x37b7U, 0x3659U, 1U, 0U, 1U},
 #endif
@@ -67,7 +67,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
         {0xb3ccU, 0xb0f9U, 0U, 1U, 0U},
         // x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ)
         {0xb8a5U, 0xb54cU, 0U, 1U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ)
         {0xba8dU, 0xb6edU, 0U, 1U, 1U},
 #endif

diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
@@ -38,14 +38,14 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
     return 0x1.8dbe62p-3f;
   }
 
-#if !defined(LIBC_TARGET_CPU_HAS_FMA)
+#if !defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE)
   if (LIBC_UNLIKELY(x_u == 0xbdc1'c6cbU)) { // x = -0x1.838d96p-4f
     int round_mode = fputil::quick_get_round();
     if (round_mode == FE_TONEAREST || round_mode == FE_DOWNWARD)
       return -0x1.71c884p-4f;
     return -0x1.71c882p-4f;
   }
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
   // When |x| > 25*log(2), or nan
   if (LIBC_UNLIKELY(x_abs >= 0x418a'a123U)) {
@@ -102,12 +102,12 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
         // 2^-76. For targets without FMA instructions, we simply use double for
         // intermediate results as it is more efficient than using an emulated
         // version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
-      return fputil::fma<float>(x, x, x);
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
+      return fputil::multiply_add<float>(x, x, x);
 #else
       double xd = x;
       return static_cast<float>(fputil::multiply_add(xd, xd, xd));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
     }
 
     constexpr double COEFFS[] = {0x1p-1,

diff --git a/libc/src/math/generic/expm1f16.cpp b/libc/src/math/generic/expm1f16.cpp
@@ -29,7 +29,7 @@ static constexpr fputil::ExceptValues<float16, 1> EXPM1F16_EXCEPTS_LO = {{
     {0x2959U, 0x2975U, 1U, 0U, 1U},
 }};
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 2;
 #else
 static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 3;
@@ -42,7 +42,7 @@ static constexpr fputil::ExceptValues<float16, N_EXPM1F16_EXCEPTS_HI>
         {0x3f0dU, 0x44d3U, 1U, 0U, 1U},
         // x = -0x1.e28p-3, expm1f16(x) = -0x1.adcp-3 (RZ)
         {0xb38aU, 0xb2b7U, 0U, 1U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.a08p-3, exp10m1f(x) = 0x1.cdcp-3 (RZ)
         {0x3282U, 0x3337U, 1U, 0U, 0U},
 #endif

diff --git a/libc/src/math/generic/fmul.cpp b/libc/src/math/generic/fmul.cpp
@@ -21,7 +21,7 @@ LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
   // correctly rounded for all rounding modes, so we fall
   // back to the generic `fmul` implementation
 
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
   return fputil::generic::mul<float>(x, y);
 #else
   fputil::DoubleDouble prod = fputil::exact_mult(x, y);

diff --git a/libc/src/math/generic/hypotf.cpp b/libc/src/math/generic/hypotf.cpp
@@ -55,7 +55,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
 
   // These squares are exact.
   double a_sq = ad * ad;
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
   double sum_sq = fputil::multiply_add(bd, bd, a_sq);
 #else
   double b_sq = bd * bd;
@@ -72,7 +72,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
     double r_d = result.get_val();
 
     // Perform rounding correction.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
     double sum_sq_lo = fputil::multiply_add(bd, bd, a_sq - sum_sq);
     double err = sum_sq_lo - fputil::multiply_add(r_d, r_d, -sum_sq);
 #else