[libc] Provide more fine-grained control of FMA instruction for ARM targets. #130700

lntue · 2025-03-11T02:40:38Z

No description provided.

…argets.

llvmbot · 2025-03-11T02:41:16Z

@llvm/pr-subscribers-libc

Author: None (lntue)

Changes

Patch is 42.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130700.diff

41 Files Affected:

(modified) libc/src/__support/FPUtil/FMA.h (+5)
(modified) libc/src/__support/FPUtil/double_double.h (+28-8)
(modified) libc/src/__support/FPUtil/multiply_add.h (+4)
(modified) libc/src/__support/macros/properties/cpu_features.h (+15)
(modified) libc/src/math/generic/asinf.cpp (+2-2)
(modified) libc/src/math/generic/atan2f.cpp (+2-2)
(modified) libc/src/math/generic/atanf.cpp (+2-2)
(modified) libc/src/math/generic/cbrt.cpp (+1-1)
(modified) libc/src/math/generic/cos.cpp (+2-2)
(modified) libc/src/math/generic/cosf.cpp (+2-2)
(modified) libc/src/math/generic/cospif.cpp (+2-2)
(modified) libc/src/math/generic/exp10f16.cpp (+2-2)
(modified) libc/src/math/generic/exp10m1f16.cpp (+2-2)
(modified) libc/src/math/generic/exp2.cpp (+2-2)
(modified) libc/src/math/generic/exp2m1f16.cpp (+4-4)
(modified) libc/src/math/generic/expm1f.cpp (+5-5)
(modified) libc/src/math/generic/expm1f16.cpp (+2-2)
(modified) libc/src/math/generic/fmul.cpp (+1-1)
(modified) libc/src/math/generic/hypotf.cpp (+2-2)
(modified) libc/src/math/generic/log.cpp (+2-2)
(modified) libc/src/math/generic/log10.cpp (+2-2)
(modified) libc/src/math/generic/log10f.cpp (+4-4)
(modified) libc/src/math/generic/log10f16.cpp (+3-3)
(modified) libc/src/math/generic/log1p.cpp (+2-2)
(modified) libc/src/math/generic/log1pf.cpp (+2-2)
(modified) libc/src/math/generic/log2.cpp (+2-2)
(modified) libc/src/math/generic/log2f.cpp (+2-2)
(modified) libc/src/math/generic/log2f16.cpp (+4-4)
(modified) libc/src/math/generic/logf.cpp (+4-4)
(modified) libc/src/math/generic/logf16.cpp (+4-4)
(modified) libc/src/math/generic/pow.cpp (+2-2)
(modified) libc/src/math/generic/powf.cpp (+4-4)
(modified) libc/src/math/generic/range_reduction_double_common.h (+2-2)
(modified) libc/src/math/generic/sin.cpp (+4-4)
(modified) libc/src/math/generic/sincos.cpp (+4-4)
(modified) libc/src/math/generic/sincos_eval.h (+2-2)
(modified) libc/src/math/generic/sincosf.cpp (+2-2)
(modified) libc/src/math/generic/sincosf_utils.h (+2-2)
(modified) libc/src/math/generic/sinf.cpp (+3-3)
(modified) libc/src/math/generic/tan.cpp (+4-4)
(modified) libc/src/math/generic/tanf.cpp (+2-2)

diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h
index 1e40d06dc1462..2cafb4c0974e3 100644
--- a/libc/src/__support/FPUtil/FMA.h
+++ b/libc/src/__support/FPUtil/FMA.h
@@ -24,6 +24,8 @@ LIBC_INLINE OutType fma(InType x, InType y, InType z) {
 }
 
 #ifdef LIBC_TARGET_CPU_HAS_FMA
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 template <> LIBC_INLINE float fma(float x, float y, float z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
@@ -31,7 +33,9 @@ template <> LIBC_INLINE float fma(float x, float y, float z) {
   return __builtin_fmaf(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
 
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 template <> LIBC_INLINE double fma(double x, double y, double z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
@@ -39,6 +43,7 @@ template <> LIBC_INLINE double fma(double x, double y, double z) {
   return __builtin_fma(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 #endif // LIBC_TARGET_CPU_HAS_FMA
 
 } // namespace fputil
diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
index b24ffd4aa456f..c27885aadc028 100644
--- a/libc/src/__support/FPUtil/double_double.h
+++ b/libc/src/__support/FPUtil/double_double.h
@@ -100,6 +100,26 @@ LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) {
   return r;
 }
 
+// The templated exact multiplication needs template version of
+// LIBC_TARGET_CPU_HAS_FMA_* macro to correctly select the implementation.
+// These can be moved to "src/__support/macros/properties/cpu_features.h" if
+// other part of libc needed.
+template <typename T> struct TargetHasFmaInstruction {
+  static constexpr bool VALUE = false;
+};
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
+template <> struct TargetHasFmaInstruction<float> {
+  static constexpr bool VALUE = true;
+};
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+template <> struct TargetHasFmaInstruction<double> {
+  static constexpr bool VALUE = true;
+};
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+
 // Note: When FMA instruction is not available, the `exact_mult` function is
 // only correct for round-to-nearest mode.  See:
 //   Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
@@ -111,15 +131,15 @@ template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
 LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
   NumberPair<T> r{0.0, 0.0};
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
-  r.hi = a * b;
-  r.lo = fputil::multiply_add(a, b, -r.hi);
-#else
-  // Dekker's Product.
-  NumberPair<T> as = split(a);
+  if constexpr (TargetHasFmaInstruction<T>::VALUE) {
+    r.hi = a * b;
+    r.lo = fputil::multiply_add(a, b, -r.hi);
+  } else {
+    // Dekker's Product.
+    NumberPair<T> as = split(a);
 
-  r = exact_mult<T, SPLIT_B>(as, a, b);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+    r = exact_mult<T, SPLIT_B>(as, a, b);
+  }
 
   return r;
 }
diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h
index ae00e08673d08..8260702e2c9f4 100644
--- a/libc/src/__support/FPUtil/multiply_add.h
+++ b/libc/src/__support/FPUtil/multiply_add.h
@@ -46,6 +46,7 @@ multiply_add(T x, T y, T z) {
 namespace LIBC_NAMESPACE_DECL {
 namespace fputil {
 
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 LIBC_INLINE float multiply_add(float x, float y, float z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
@@ -53,7 +54,9 @@ LIBC_INLINE float multiply_add(float x, float y, float z) {
   return __builtin_fmaf(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
 
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 LIBC_INLINE double multiply_add(double x, double y, double z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
@@ -61,6 +64,7 @@ LIBC_INLINE double multiply_add(double x, double y, double z) {
   return __builtin_fma(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
 } // namespace fputil
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index d2cea367516db..1714775ca334d 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -45,6 +45,21 @@
 #if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) ||   \
     defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA)
 #define LIBC_TARGET_CPU_HAS_FMA
+// Provide a more fine-grained control of FMA instruction for ARM targets.
+#if defined(__ARM_FP)
+#if (__ARM_FP & 0x2)
+#define LIBC_TARGET_CPU_HAS_FMA_HALF
+#endif // LIBC_TARGET_CPU_HAS_FMA_HALF
+#if (__ARM_FP & 0x4)
+#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#if (__ARM_FP & 0x8)
+#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#else
+#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#endif
 #endif
 
 #if defined(LIBC_TARGET_ARCH_IS_AARCH64) ||                                    \
diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp
index 3a89def8f6e0c..b54a9e7b2b00b 100644
--- a/libc/src/math/generic/asinf.cpp
+++ b/libc/src/math/generic/asinf.cpp
@@ -74,12 +74,12 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) {
       // |x| < 2^-125. For targets without FMA instructions, we simply use
       // double for intermediate results as it is more efficient than using an
       // emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
       return fputil::multiply_add(x, 0x1.0p-25f, x);
 #else
       double xd = static_cast<double>(x);
       return static_cast<float>(fputil::multiply_add(xd, 0x1.0p-25, xd));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
     }
 
     // Check for exceptional values
diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp
index 5ac2b29438ea9..726cae9c8462b 100644
--- a/libc/src/math/generic/atan2f.cpp
+++ b/libc/src/math/generic/atan2f.cpp
@@ -131,7 +131,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
     num_r = num_d;
     den_r = den_d;
   }
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
   q.lo = fputil::multiply_add(q.hi, -den_r, num_r) / den_r;
 #else
   // Compute `(num_r - q.hi * den_r) / den_r` accurately without FMA
@@ -140,7 +140,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
   double t1 = fputil::multiply_add(q_hi_dd.hi, -den_r, num_r); // Exact
   double t2 = fputil::multiply_add(q_hi_dd.lo, -den_r, t1);
   q.lo = t2 / den_r;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
   // Taylor polynomial, evaluating using Horner's scheme:
   //   P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9 - x^11/11 + x^13/13 - x^15/15
diff --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp
index 5e0788efbeb88..46196dbe4162c 100644
--- a/libc/src/math/generic/atanf.cpp
+++ b/libc/src/math/generic/atanf.cpp
@@ -52,12 +52,12 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
       return x;
     // x <= 2^-12;
     if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
       return fputil::multiply_add(x, -0x1.0p-25f, x);
 #else
       double x_d = static_cast<double>(x);
       return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
     }
     // Use Taylor polynomial:
     //   atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
diff --git a/libc/src/math/generic/cbrt.cpp b/libc/src/math/generic/cbrt.cpp
index ee7d69b2c211f..ce227e6650c84 100644
--- a/libc/src/math/generic/cbrt.cpp
+++ b/libc/src/math/generic/cbrt.cpp
@@ -58,7 +58,7 @@ double intial_approximation(double x) {
 
 // Get the error term for Newton iteration:
 //   h(x) = x^3 * a^2 - 1,
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
   return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
          fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);
diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp
index 568b1254c6f02..b60082bf9c308 100644
--- a/libc/src/math/generic/cos.cpp
+++ b/libc/src/math/generic/cos.cpp
@@ -20,11 +20,11 @@
 #include "src/math/generic/range_reduction_double_common.h"
 #include "src/math/generic/sincos_eval.h"
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 #include "range_reduction_double_fma.h"
 #else
 #include "range_reduction_double_nofma.h"
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp
index 972ffa923aedf..23e3db067e669 100644
--- a/libc/src/math/generic/cosf.cpp
+++ b/libc/src/math/generic/cosf.cpp
@@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
     // |x| < 2^-125. For targets without FMA instructions, we simply use
     // double for intermediate results as it is more efficient than using an
     // emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
     return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
 #else
     return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
   }
 
   if (auto r = COSF_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))
diff --git a/libc/src/math/generic/cospif.cpp b/libc/src/math/generic/cospif.cpp
index 4ef1539539921..29566f4fceacf 100644
--- a/libc/src/math/generic/cospif.cpp
+++ b/libc/src/math/generic/cospif.cpp
@@ -50,11 +50,11 @@ LLVM_LIBC_FUNCTION(float, cospif, (float x)) {
   // The exhautive test passes for smaller values
   if (LIBC_UNLIKELY(x_abs < 0x38A2'F984U)) {
 
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
     return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
 #else
     return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
   }
 
   // Numbers greater or equal to 2^23 are always integers or NaN
diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp
index 006dd5c554428..f2002e9f146c0 100644
--- a/libc/src/math/generic/exp10f16.cpp
+++ b/libc/src/math/generic/exp10f16.cpp
@@ -26,7 +26,7 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 static constexpr size_t N_EXP10F16_EXCEPTS = 5;
 #else
 static constexpr size_t N_EXP10F16_EXCEPTS = 8;
@@ -44,7 +44,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
         {0xbf0aU, 0x2473U, 1U, 0U, 0U},
         // x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ)
         {0xc387U, 0x09a5U, 1U, 0U, 0U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ)
         {0x4030U, 0x57c1U, 1U, 0U, 1U},
         // x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ)
diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp
index 449aedf254ca5..41e2c2bb14b04 100644
--- a/libc/src/math/generic/exp10m1f16.cpp
+++ b/libc/src/math/generic/exp10m1f16.cpp
@@ -34,7 +34,7 @@ static constexpr fputil::ExceptValues<float16, 3> EXP10M1F16_EXCEPTS_LO = {{
     {0x9788U, 0x9c53U, 0U, 1U, 0U},
 }};
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3;
 #else
 static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6;
@@ -49,7 +49,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10M1F16_EXCEPTS_HI>
         {0x3657U, 0x3df6U, 1U, 0U, 0U},
         // x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ)
         {0x3741U, 0x3f5cU, 1U, 0U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ)
         {0x4030U, 0x57b1U, 1U, 0U, 1U},
         // x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ)
diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp
index 2c612777c9cb5..726f88b6457fc 100644
--- a/libc/src/math/generic/exp2.cpp
+++ b/libc/src/math/generic/exp2.cpp
@@ -35,11 +35,11 @@ using LIBC_NAMESPACE::operator""_u128;
 
 // Error bounds:
 // Errors when using double precision.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 constexpr double ERR_D = 0x1.0p-63;
 #else
 constexpr double ERR_D = 0x1.8p-63;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
 #ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 // Errors when using double-double precision.
diff --git a/libc/src/math/generic/exp2m1f16.cpp b/libc/src/math/generic/exp2m1f16.cpp
index 6a1cd2328a050..eceb76f1893e2 100644
--- a/libc/src/math/generic/exp2m1f16.cpp
+++ b/libc/src/math/generic/exp2m1f16.cpp
@@ -40,7 +40,7 @@ static constexpr fputil::ExceptValues<float16, 6> EXP2M1F16_EXCEPTS_LO = {{
     {0x973fU, 0x9505U, 0U, 1U, 0U},
 }};
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6;
 #else
 static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7;
@@ -51,13 +51,13 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
         // (input, RZ output, RU offset, RD offset, RN offset)
         // x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ)
         {0x3396U, 0x31b7U, 1U, 0U, 0U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ)
         {0x34baU, 0x3345U, 1U, 0U, 0U},
 #endif
         // x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ)
         {0x36b6U, 0x3566U, 1U, 0U, 0U},
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ)
         {0x37b7U, 0x3659U, 1U, 0U, 1U},
 #endif
@@ -67,7 +67,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
         {0xb3ccU, 0xb0f9U, 0U, 1U, 0U},
         // x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ)
         {0xb8a5U, 0xb54cU, 0U, 1U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ)
         {0xba8dU, 0xb6edU, 0U, 1U, 1U},
 #endif
diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index d5e9e85ed4bd3..655f0e6246676 100644
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -38,14 +38,14 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
     return 0x1.8dbe62p-3f;
   }
 
-#if !defined(LIBC_TARGET_CPU_HAS_FMA)
+#if !defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE)
   if (LIBC_UNLIKELY(x_u == 0xbdc1'c6cbU)) { // x = -0x1.838d96p-4f
     int round_mode = fputil::quick_get_round();
     if (round_mode == FE_TONEAREST || round_mode == FE_DOWNWARD)
       return -0x1.71c884p-4f;
     return -0x1.71c882p-4f;
   }
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
   // When |x| > 25*log(2), or nan
   if (LIBC_UNLIKELY(x_abs >= 0x418a'a123U)) {
@@ -102,12 +102,12 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
         // 2^-76. For targets without FMA instructions, we simply use double for
         // intermediate results as it is more efficient than using an emulated
         // version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
-      return fputil::fma<float>(x, x, x);
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
+      return fputil::multiply_add<float>(x, x, x);
 #else
       double xd = x;
       return static_cast<float>(fputil::multiply_add(xd, xd, xd));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
     }
 
     constexpr double COEFFS[] = {0x1p-1,
diff --git a/libc/src/math/generic/expm1f16.cpp b/libc/src/math/generic/expm1f16.cpp
index 4ce0efd1f461b..bfd263eaa9cb0 100644
--- a/libc/src/math/generic/expm1f16.cpp
+++ b/libc/src/math/generic/expm1f16.cpp
@@ -29,7 +29,7 @@ static constexpr fputil::ExceptValues<float16, 1> EXPM1F16_EXCEPTS_LO = {{
     {0x2959U, 0x2975U, 1U, 0U, 1U},
 }};
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 2;
 #else
 static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 3;
@@ -42,7 +42,7 @@ static constexpr fputil::ExceptValues<float16, N_EXPM1F16_EXCEPTS_HI>
         {0x3f0dU, 0x44d3U, 1U, 0U, 1U},
         // x = -0x1.e28p-3, expm1f16(x) = -0x1.adcp-3 (RZ)
         {0xb38aU, 0xb2b7U, 0U, 1U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
         // x = 0x1.a08p-3, exp10m1f(x) = 0x1.cdcp-3 (RZ)
         {0x3282U, 0x3337U, 1U, 0U, 0U},
 #endif
diff --git a/libc/src/math/generic/fmul.cpp b/libc/src/math/generic/fmul.cpp
index e759e48cd6989..daad64873f27a 100644
--- a/libc/src/math/generic/fmul.cpp
+++ b/libc/src/math/generic/fmul.cpp
@@ -21,7 +21,7 @@ LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
   // correctly rounded for all rounding modes, so we fall
   // back to the generic `fmul` implementation
 
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
   return fputil::generic::mul<float>(x, y);
 #else
   fputil::DoubleDouble prod = fputil::exact_mult(x, y);
diff --git a/libc/src/math/generic/hypotf.cpp b/libc/src/math/generic/hypotf.cpp
index 959c0420ae149..ec48f62163a48 100644
--- a/libc/src/math/generic/hypotf.cpp
+++ b/libc/src/math/generic/hypotf.cpp
@@ -55,7 +55,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
 
   // These squares are exact.
   double a_sq = ad * ad;
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
   double sum_sq = fputil::multiply_add(bd, bd, a_sq);
 #else
   double b_sq = bd * bd;
@@ -72,7 +72,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
     double r_d = result.get_val();
 
     // Perform rounding correction.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
     double sum_sq_lo = fputil::multiply_add(bd, bd, a_sq - sum_sq);
     double err = sum_sq_lo - fputil::multiply_add(r_d, r_d, -sum_sq);
 #else
diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp
index 04eebab975cd5..0cd4424ee0baf 100644
--- a/libc/src/math/generic/log.cpp
+++ b/libc/src/math/generic/log.cpp
@@ -800,13 +800,13 @@ LLVM_LIBC_FUNCTION(double, log, (double x)) {
   fputil::DoubleDouble r1;
 
   // Perform exact range reduction
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
   u = fputil::multiply_add(r, m, -1.0); // exact
 #else
   uint64_t c_m = x_m & 0x3FFF'E000'0000'0000ULL;
   double c = FPBits_t(c_m).get_val();
   u = fputil::multiply_add(r, m - c, CD[index]); // exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
   // Exact sum:
   //   r1.hi + r1.lo = e_x * log(2)_hi - log(r)_hi + u
diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp
index fd8d5a8aae938..1c4e559ba083c 100644
--- a/libc/src/math/generic/log10.cpp
+++ b/libc/src/math/generic/log10.cpp
@@ -802,13 +802,13 @@ LLVM_LIBC_FUNCTION(double, log10, (double x)) {
   fputil::DoubleDouble r1;
 
   // Perform exact range reduction
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
   u = fputil::multiply_add(r, m, -1.0); // exact
 #else
   uint64_t c_m = x_m & 0x3FFF'E000'0000'0000ULL;
   double c = FPBits_t(c_m).get_val();
   u = fputil::multiply_add(r, m - c, CD[index]); // exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
   // Error of u_sq = ulp(u^2);
   u_sq = u * u;
diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp
index c635fa4ef9b63..73ca26374e4a3 100644
--- a/libc/src/math/generic/log10f.cpp
+++ b/libc/src/math/generic/log10f.cpp
@@ -145,7 +145,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
       return fputil::round_result_slightly_up(-0x1.dd2...
[truncated]

jhuber6

Seems pretty mechanical, guessing this is for architectures that may have one and not the other.

[libc] Provide more fine-grained control of FMA instruction for ARM t…

d8704b4

…argets.

lntue requested a review from petrhosek March 11, 2025 02:40

llvmbot added the libc label Mar 11, 2025

lntue requested a review from Prabhuk March 11, 2025 02:44

jhuber6 approved these changes Mar 11, 2025

View reviewed changes

Fix few multiply_add calls with templated versions.

7f5f016

lntue merged commit 5038e2f into llvm:main Mar 11, 2025
16 checks passed

lntue deleted the arm branch March 11, 2025 14:18

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[libc] Provide more fine-grained control of FMA instruction for ARM targets. #130700

[libc] Provide more fine-grained control of FMA instruction for ARM targets. #130700

Uh oh!

lntue commented Mar 11, 2025

Uh oh!

llvmbot commented Mar 11, 2025

Uh oh!

jhuber6 left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[libc] Provide more fine-grained control of FMA instruction for ARM targets. #130700

[libc] Provide more fine-grained control of FMA instruction for ARM targets. #130700

Uh oh!

Conversation

lntue commented Mar 11, 2025

Uh oh!

llvmbot commented Mar 11, 2025

Uh oh!

jhuber6 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants