GPUOpen-Drivers
diff --git a/‎libc/src/__support/FPUtil/FMA.h
Lines changed: 5 additions & 0 deletions b/‎libc/src/__support/FPUtil/FMA.h
Lines changed: 5 additions & 0 deletions
diff --git a/‎libc/src/__support/FPUtil/double_double.h
Lines changed: 28 additions & 8 deletions b/‎libc/src/__support/FPUtil/double_double.h
Lines changed: 28 additions & 8 deletions
diff --git a/‎libc/src/__support/FPUtil/multiply_add.h
Lines changed: 4 additions & 0 deletions b/‎libc/src/__support/FPUtil/multiply_add.h
Lines changed: 4 additions & 0 deletions
diff --git a/‎libc/src/__support/macros/properties/cpu_features.h
Lines changed: 15 additions & 0 deletions b/‎libc/src/__support/macros/properties/cpu_features.h
Lines changed: 15 additions & 0 deletions
diff --git a/‎libc/src/math/generic/asinf.cpp
Lines changed: 2 additions & 2 deletions b/‎libc/src/math/generic/asinf.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/‎libc/src/math/generic/atan2f.cpp
Lines changed: 2 additions & 2 deletions b/‎libc/src/math/generic/atan2f.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/‎libc/src/math/generic/atanf.cpp
Lines changed: 2 additions & 2 deletions b/‎libc/src/math/generic/atanf.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/‎libc/src/math/generic/cbrt.cpp
Lines changed: 1 addition & 1 deletion b/‎libc/src/math/generic/cbrt.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎libc/src/math/generic/cos.cpp
Lines changed: 2 additions & 2 deletions b/‎libc/src/math/generic/cos.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/‎libc/src/math/generic/cosf.cpp
Lines changed: 2 additions & 2 deletions b/‎libc/src/math/generic/cosf.cpp
Lines changed: 2 additions & 2 deletions
@@ -24,21 +24,26 @@ LIBC_INLINE OutType fma(InType x, InType y, InType z) {
 }
 
 #ifdef LIBC_TARGET_CPU_HAS_FMA
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 template <> LIBC_INLINE float fma(float x, float y, float z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
 #else
   return __builtin_fmaf(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
 
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 template <> LIBC_INLINE double fma(double x, double y, double z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
 #else
   return __builtin_fma(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 #endif // LIBC_TARGET_CPU_HAS_FMA
 
 } // namespace fputil
 
@@ -100,6 +100,26 @@ LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) {
   return r;
 }
 
+// The templated exact multiplication needs template version of
+// LIBC_TARGET_CPU_HAS_FMA_* macro to correctly select the implementation.
+// These can be moved to "src/__support/macros/properties/cpu_features.h" if
+// other part of libc needed.
+template <typename T> struct TargetHasFmaInstruction {
+  static constexpr bool VALUE = false;
+};
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
+template <> struct TargetHasFmaInstruction<float> {
+  static constexpr bool VALUE = true;
+};
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+template <> struct TargetHasFmaInstruction<double> {
+  static constexpr bool VALUE = true;
+};
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+
 // Note: When FMA instruction is not available, the `exact_mult` function is
 // only correct for round-to-nearest mode.  See:
 //   Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
@@ -111,15 +131,15 @@ template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
 LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
   NumberPair<T> r{0.0, 0.0};
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
-  r.hi = a * b;
-  r.lo = fputil::multiply_add(a, b, -r.hi);
-#else
-  // Dekker's Product.
-  NumberPair<T> as = split(a);
+  if constexpr (TargetHasFmaInstruction<T>::VALUE) {
+    r.hi = a * b;
+    r.lo = fputil::multiply_add(a, b, -r.hi);
+  } else {
+    // Dekker's Product.
+    NumberPair<T> as = split(a);
 
-  r = exact_mult<T, SPLIT_B>(as, a, b);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+    r = exact_mult<T, SPLIT_B>(as, a, b);
+  }
 
   return r;
 }
 
@@ -46,21 +46,25 @@ multiply_add(T x, T y, T z) {
 namespace LIBC_NAMESPACE_DECL {
 namespace fputil {
 
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
 LIBC_INLINE float multiply_add(float x, float y, float z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
 #else
   return __builtin_fmaf(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
 
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 LIBC_INLINE double multiply_add(double x, double y, double z) {
 #if __has_builtin(__builtin_elementwise_fma)
   return __builtin_elementwise_fma(x, y, z);
 #else
   return __builtin_fma(x, y, z);
 #endif
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
 } // namespace fputil
 } // namespace LIBC_NAMESPACE_DECL
 
@@ -45,6 +45,21 @@
 #if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) ||   \
     defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA)
 #define LIBC_TARGET_CPU_HAS_FMA
+// Provide a more fine-grained control of FMA instruction for ARM targets.
+#if defined(__ARM_FP)
+#if (__ARM_FP & 0x2)
+#define LIBC_TARGET_CPU_HAS_FMA_HALF
+#endif // LIBC_TARGET_CPU_HAS_FMA_HALF
+#if (__ARM_FP & 0x4)
+#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#if (__ARM_FP & 0x8)
+#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#else
+#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#endif
 #endif
 
 #if defined(LIBC_TARGET_ARCH_IS_AARCH64) ||                                    \
 
@@ -74,12 +74,12 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) {
       // |x| < 2^-125. For targets without FMA instructions, we simply use
       // double for intermediate results as it is more efficient than using an
       // emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
       return fputil::multiply_add(x, 0x1.0p-25f, x);
 #else
       double xd = static_cast<double>(x);
       return static_cast<float>(fputil::multiply_add(xd, 0x1.0p-25, xd));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
     }
 
     // Check for exceptional values
 
@@ -131,7 +131,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
     num_r = num_d;
     den_r = den_d;
   }
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
   q.lo = fputil::multiply_add(q.hi, -den_r, num_r) / den_r;
 #else
   // Compute `(num_r - q.hi * den_r) / den_r` accurately without FMA
@@ -140,7 +140,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
   double t1 = fputil::multiply_add(q_hi_dd.hi, -den_r, num_r); // Exact
   double t2 = fputil::multiply_add(q_hi_dd.lo, -den_r, t1);
   q.lo = t2 / den_r;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
   // Taylor polynomial, evaluating using Horner's scheme:
   //   P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9 - x^11/11 + x^13/13 - x^15/15
 
@@ -52,12 +52,12 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
       return x;
     // x <= 2^-12;
     if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
       return fputil::multiply_add(x, -0x1.0p-25f, x);
 #else
       double x_d = static_cast<double>(x);
       return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
     }
     // Use Taylor polynomial:
     //   atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
 
@@ -58,7 +58,7 @@ double intial_approximation(double x) {
 
 // Get the error term for Newton iteration:
 //   h(x) = x^3 * a^2 - 1,
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
   return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
          fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);
 
@@ -20,11 +20,11 @@
 #include "src/math/generic/range_reduction_double_common.h"
 #include "src/math/generic/sincos_eval.h"
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 #include "range_reduction_double_fma.h"
 #else
 #include "range_reduction_double_nofma.h"
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
 
 namespace LIBC_NAMESPACE_DECL {
 
 
@@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
     // |x| < 2^-125. For targets without FMA instructions, we simply use
     // double for intermediate results as it is more efficient than using an
     // emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
     return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
 #else
     return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
   }
 
   if (auto r = COSF_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))