Improve exp() efficiency

ckormanyos · ckormanyos · commit 6102696f9803 · 2024-05-05T12:59:14.000+02:00
diff --git a/include/boost/decimal/detail/cmath/exp.hpp b/include/boost/decimal/detail/cmath/exp.hpp
@@ -76,7 +76,7 @@ constexpr auto exp_impl(T x) noexcept
                 x -= numbers::ln2_v<T> * nf2;
             }
 
-            result = detail::exp_pade_appxroximant(x);
+            result = detail::exp_pade_appxroximant_or_series(x);
 
             if (nf2 > 0)
             {
diff --git a/include/boost/decimal/detail/cmath/impl/exp_impl.hpp b/include/boost/decimal/detail/cmath/impl/exp_impl.hpp
@@ -7,6 +7,7 @@
 #define BOOST_DECIMAL_DETAIL_CMATH_IMPL_EXP_IMPL_HPP
 
 #include <boost/decimal/detail/concepts.hpp>
+#include <boost/decimal/detail/cmath/impl/taylor_series_result.hpp>
 
 #ifndef BOOST_DECIMAL_BUILD_MODULE
 #include <array>
@@ -18,32 +19,76 @@ namespace boost {
 namespace decimal {
 namespace detail {
 
+namespace exp_detail {
+
+template <bool b>
+struct exp_table_imp
+{
+private:
+    using d128_coeffs_t = std::array<decimal128, 17>;
+
+public:
+    static constexpr d128_coeffs_t d128_coeffs =
+    {{
+         // Series[Exp[x] - 1, {x, 0, 18}]: the stored entries are 1/k! for k = 2 ... 18
+         //            (1),                                                                                                                   // * x
+         ::boost::decimal::decimal128 { 5, -1 },                                                                                              // * x^2
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(90350181040458),  UINT64_C(12964998083131386532) }, -34 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(225875452601146), UINT64_C(13965751134118914724) }, -35 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(451750905202293), UINT64_C(9484758194528277842)  }, -36 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(75291817533715),  UINT64_C(10804165069276155440) }, -36 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(107559739333879), UINT64_C(7528774067376128516)  }, -37 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(134449674167349), UINT64_C(4799281565792772746)  }, -38 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(149388526852610), UINT64_C(5332535073103080820)  }, -39 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(149388526852610), UINT64_C(5332535073103080820)  }, -40 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(135807751684191), UINT64_C(3170782423392841514)  }, -41 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(113173126403492), UINT64_C(11865690723015477068) }, -42 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(87056251079609),  UINT64_C(13384395342406417346) }, -43 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(62183036485435),  UINT64_C(9560282387433155251)  }, -44 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(414553576569570), UINT64_C(2246069003855862950)  }, -46 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(259095985355981), UINT64_C(6015479145837302244)  }, -47 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(152409403150577), UINT64_C(4623619737181327888)  }, -48 },
+         ::boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(84671890639209),  UINT64_C(10767230553416093986) }, -49 },
+    }};
+};
+
+#if !(defined(__cpp_inline_variables) && __cpp_inline_variables >= 201606L) && (!defined(_MSC_VER) || _MSC_VER != 1900)
+
+template <bool b>
+constexpr typename exp_table_imp<b>::d128_coeffs_t exp_table_imp<b>::d128_coeffs;
+
+#endif
+
+} //namespace exp_detail
+
+using exp_table = exp_detail::exp_table_imp<true>;
+
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE T>
-constexpr auto exp_pade_appxroximant(T x) noexcept;
+constexpr auto exp_pade_appxroximant_or_series(T x) noexcept;
 
 template <>
-constexpr auto exp_pade_appxroximant<decimal32>(decimal32 x) noexcept
+constexpr auto exp_pade_appxroximant_or_series<decimal32>(decimal32 x) noexcept
 {
-    // TODO: Chris: At 32-bit, reduce the number of coefficients in the Pade appxorimant of the exp() function.
-
     using local_float_t = decimal32;
 
-    // PadeApproximant[Exp[x] - 1, {x, 0, {6, 6}}]
+    // PadeApproximant[Exp[x] - 1, {x, 0, {3, 4}}]
     // FullSimplify[%]
-    //   (84 x (7920 + 240 x^2 + x^4))
-    // / (665280 + x (-332640 + x (75600 + x (-10080 + x (840 + (-42 + x) x)))))
+    //   (40 x (42 + x^2))
+    // / (1680 + x (-840 + x (180 + (-20 + x) x)))
 
     const auto x2 = x * x;
 
     // Use the small-argument Pade approximation having coefficients shown above.
-    const local_float_t top = local_float_t { UINT8_C(84), 0 } * x * ( local_float_t { UINT16_C(7920), 0 } + ( local_float_t { UINT8_C(240), 0 } + x2) * x2);
-    const local_float_t bot = local_float_t { UINT32_C(665280), 0 } + x * (local_float_t { INT32_C(-332640), 0 } + x * (local_float_t { UINT32_C(75600), 0 } + x * (local_float_t { INT16_C(-10080), 0 } + x * (local_float_t { UINT16_C(840), 0 } + (local_float_t { INT8_C(-42), 0 } + x) * x))));
+    const local_float_t top { local_float_t { UINT8_C(40), 0 } * x * (local_float_t { UINT8_C(42), 0 } + x2) };
+    const local_float_t bot { local_float_t { UINT16_C(1680), 0 } + x * (local_float_t { INT16_C(-840), 0 } + x * (local_float_t { UINT8_C(180), 0 } + (local_float_t { INT8_C(-20), 0 } + x) * x)) };
 
-    return local_float_t { 1 } + (top / bot);
+    constexpr local_float_t one { 1 };
+
+    return one + (top / bot);
 }
 
 template <>
-constexpr auto exp_pade_appxroximant<decimal64>(decimal64 x) noexcept
+constexpr auto exp_pade_appxroximant_or_series<decimal64>(decimal64 x) noexcept
 {
     using local_float_t = decimal64;
 
@@ -58,57 +103,38 @@ constexpr auto exp_pade_appxroximant<decimal64>(decimal64 x) noexcept
     const local_float_t top = local_float_t { UINT8_C(84), 0 } * x * ( local_float_t { UINT16_C(7920), 0 } + ( local_float_t { UINT8_C(240), 0 } + x2) * x2);
     const local_float_t bot = local_float_t { UINT32_C(665280), 0 } + x * (local_float_t { INT32_C(-332640), 0 } + x * (local_float_t { UINT32_C(75600), 0 } + x * (local_float_t { INT16_C(-10080), 0 } + x * (local_float_t { UINT16_C(840), 0 } + (local_float_t { INT8_C(-42), 0 } + x) * x))));
 
-    return local_float_t { 1 } + (top / bot);
+    constexpr local_float_t one { 1 };
+
+    return one + (top / bot);
 }
 
 template <>
-constexpr auto exp_pade_appxroximant<decimal128>(decimal128 x) noexcept
+constexpr auto exp_pade_appxroximant_or_series<decimal128>(decimal128 x) noexcept
 {
     // Compute exp(x) - 1 for x small.
-
-    // TODO: Does it make sense to try and improve accuracy/precision with more Pade terms?
-    // Or would a simple Tylor expansion here simply be better?
-
-    // Use an order-12 Pade approximation of the exponential function.
-    // PadeApproximant[Exp[x] - 1, {x, 0, 12, 12}].
+    // Use argument scaling in combination with a Taylor series expansion to order-18.
 
     using local_float_t = decimal128;
 
     // Rescale the argument even further (and note the three squarings below).
     x /= 8;
 
-    const local_float_t x2 = (x * x);
-
-    const local_float_t top = (((((  local_float_t { boost::decimal::detail::uint128 { UINT64_C(130576843339991), UINT64_C(2348781707059460614)  }, -46 }   * x2
-                                   + local_float_t { boost::decimal::detail::uint128 { UINT64_C(502720846858965), UINT64_C(15499169997977266440) }, -43 } ) * x2
-                                   + local_float_t { boost::decimal::detail::uint128 { UINT64_C(492264253244299), UINT64_C(6469924059228430936)  }, -40 } ) * x2
-                                   + local_float_t { boost::decimal::detail::uint128 { UINT64_C(168354374609550), UINT64_C(6971973999273187690)  }, -37 } ) * x2
-                                   + local_float_t { boost::decimal::detail::uint128 { UINT64_C(196413437044475), UINT64_C(8133969665818718980)  }, -35 } ) * x2
-                                   + local_float_t { boost::decimal::detail::uint128 { UINT64_C(54210108624275),  UINT64_C(4089650035136921600)  }, -33 } )
-                                   ;
-
-    const local_float_t bot = ((((((((((((  local_float_t( +boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(418515523525612), UINT64_C(10839100561497421498) }, -49 } )  * x
-                                          + local_float_t( -boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(65288421669995),  UINT64_C(10397762890384506116) }, -46 } )) * x
-                                          + local_float_t( +boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(502720846858965), UINT64_C(15499169997977266440) }, -45 } )) * x
-                                          + local_float_t( -boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(251360423429482), UINT64_C(16972957035843409028) }, -43 } )) * x
-                                          + local_float_t( +boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(90489752434613),  UINT64_C(15702571451232594082) }, -41 } )) * x
-                                          + local_float_t( -boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(246132126622149), UINT64_C(12458334066468991276) }, -40 } )) * x
-                                          + local_float_t( +boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(516877465906514), UINT64_C(5871083058504374896)  }, -39 } )) * x
-                                          + local_float_t( -boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(84177187304775),  UINT64_C(3485986999636593840)  }, -37 } )) * x
-                                          + local_float_t( +boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(105221484130968), UINT64_C(18192541804827906022) }, -36 } )) * x
-                                          + local_float_t( -boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(98206718522237),  UINT64_C(13290356869764135298) }, -35 } )) * x
-                                          + local_float_t( +boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(64816434224676),  UINT64_C(16519268045002340975) }, -34 } )) * x
-                                          + local_float_t( -boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(271050543121376), UINT64_C(2001506101975056384)  }, -34 } )) * x
-                                          + local_float_t( +boost::decimal::decimal128 { boost::decimal::detail::uint128 { UINT64_C(54210108624275),  UINT64_C(4089650035136921600)  }, -33 } ))
-                                          ;
-
-    local_float_t result { local_float_t { 1 } + ((x * top) / bot) };
+    constexpr local_float_t one { 1, 0 };
 
-    result *= result;
+    // Note: The coefficient table starts at the x^2 term (1/2!), so the series sum
+    // is multiplied by x^2 and the two skipped terms (1 + x) are added back in.
+
+    local_float_t
+        result
+        {
+            one + (x * (one + (x * taylor_series_result(x, exp_table::d128_coeffs))))
+        };
+
+    // Scale up with three squarings in order to obtain the result.
     result *= result;
     result *= result;
 
-    return result;
+    return result *= result;
 }
 
 } //namespace detail
diff --git a/test/test_exp.cpp b/test/test_exp.cpp
@@ -1,5 +1,5 @@
-// Copyright 2023 Matt Borland
-// Copyright 2023 Christopher Kormanyos
+// Copyright 2023 - 2024 Matt Borland
+// Copyright 2023 - 2024 Christopher Kormanyos
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
@@ -54,17 +54,31 @@ namespace local
 
     auto result_is_ok = bool { };
 
+    NumericType delta { };
+
     if(b == static_cast<NumericType>(0))
     {
-      result_is_ok = (fabs(a - b) < tol);
+      delta = fabs(a - b); // LCOV_EXCL_LINE
+
+      result_is_ok = (delta < tol); // LCOV_EXCL_LINE
     }
     else
     {
-      const auto delta = fabs(1 - (a / b));
+      delta = fabs(1 - (a / b));
 
       result_is_ok = (delta < tol);
     }
 
+    // LCOV_EXCL_START
+    if (!result_is_ok)
+    {
+      std::cerr << std::setprecision(std::numeric_limits<NumericType>::digits10) << "a: " << a
+                << "\nb: " << b
+                << "\ndelta: " << delta
+                << "\ntol: " << tol << std::endl;
+    }
+    // LCOV_EXCL_STOP
+
     return result_is_ok;
   }
 
@@ -115,9 +129,9 @@ namespace local
       if(!result_val_is_ok)
       {
           // LCOV_EXCL_START
-        std::cout << "x_flt  : " << std::scientific << std::setprecision(std::numeric_limits<float_type>::digits10) << x_flt   << std::endl;
-        std::cout << "val_flt: " << std::scientific << std::setprecision(std::numeric_limits<float_type>::digits10) << val_flt << std::endl;
-        std::cout << "val_dec: " << std::scientific << std::setprecision(std::numeric_limits<float_type>::digits10) << val_dec << std::endl;
+        std::cerr << "x_flt  : " << std::scientific << std::setprecision(std::numeric_limits<float_type>::digits10) << x_flt   << std::endl;
+        std::cerr << "val_flt: " << std::scientific << std::setprecision(std::numeric_limits<float_type>::digits10) << val_flt << std::endl;
+        std::cerr << "val_dec: " << std::scientific << std::setprecision(std::numeric_limits<float_type>::digits10) << val_dec << std::endl;
 
         break;
           // LCOV_EXCL_STOP
@@ -213,6 +227,92 @@ namespace local
 
     return result_is_ok;
   }
+
+  auto test_exp_128(const int tol_factor) -> bool
+  {
+    using decimal_type = boost::decimal::decimal128;
+
+    using str_ctrl_array_type = std::array<const char*, 39U>;
+
+    const str_ctrl_array_type ctrl_strings =
+    {{
+       // Table[N[Exp[n/10 + n/100], 36], {n, 1, 39, 1}]
+       "1.11627807045887129150073776905298390",
+       "1.24607673058738081952026478299269624",
+       "1.39096812846378026624274780495311882",
+       "1.55270721851133604205007964619169497",
+       "1.73325301786739523682191676713732884",
+       "1.93479233440203152169312515101969168",
+       "2.15976625378491500838755239034002685",
+       "2.41089970641720985089088491613290280",
+       "2.69123447234926228909987940407101397",
+       "3.00416602394643311205840795358867239",
+       "3.35348465254902368100358942737571204",
+       "3.74342137726086256855805582982587323",
+       "4.17869919192324615658039176435293801",
+       "4.66459027098812590279338676624377783",
+       "5.20697982717984873765730709271233513",
+       "5.81243739440258864988034062444969445",
+       "6.48829639928671111502903132434912956",
+       "7.24274298516101220851243475314474762",
+       "8.08491516430506017497344071644188155",
+       "9.02501349943412092647177716688866403",
+       "10.0744246550135862002454552896844711",
+       "11.2458593148818460799615892055305690",
+       "12.5535061366682314080320232000754142",
+       "14.0132036077336131602667577975340025",
+       "15.6426318841881716102126980461566588",
+       "17.4615269365799904170450682499698346",
+       "19.4919195960311175203209452590133521",
+       "21.7584023961970778443863882601062266",
+       "24.2884274430945556043070982961719396",
+       "27.1126389206578874268183721102312223",
+       "30.2652442594000813446015323588968824",
+       "33.7844284638495538820910085630299049",
+       "37.7128166171817490996824895604598120",
+       "42.0979901649969005914744807079465071",
+       "46.9930632315792808648304762411623248",
+       "52.4573259490990503124315131185087067",
+       "58.5569625918923670285321923410850419",
+       "65.3658532140099181652435900015868107",
+       "72.9664684996328018947164376727604433",
+    }};
+
+    std::array<decimal_type, std::tuple_size<str_ctrl_array_type>::value> exp_values { };
+    std::array<decimal_type, std::tuple_size<str_ctrl_array_type>::value> ctrl_values { };
+
+    int nx { 1 };
+
+    bool result_is_ok { true };
+
+    const decimal_type my_tol { std::numeric_limits<decimal_type>::epsilon() * static_cast<decimal_type>(tol_factor) };
+
+    for(auto i = static_cast<std::size_t>(UINT8_C(0)); i < std::tuple_size<str_ctrl_array_type>::value; ++i)
+    {
+      const decimal_type
+        x_arg
+        {
+            decimal_type { nx, -1 }
+          + decimal_type { nx, -2 }
+        };
+
+      ++nx;
+
+      exp_values[i] = exp(x_arg);
+
+      static_cast<void>
+      (
+        from_chars(ctrl_strings[i], ctrl_strings[i] + std::strlen(ctrl_strings[i]), ctrl_values[i])
+      );
+
+      const auto result_exp_is_ok = is_close_fraction(exp_values[i], ctrl_values[i], my_tol);
+
+      result_is_ok = (result_exp_is_ok && result_is_ok);
+    }
+
+    return result_is_ok;
+  }
+
 } // namespace local
 
 auto main() -> int
@@ -277,6 +377,16 @@ auto main() -> int
     result_is_ok = (result_edge_is_ok && result_is_ok);
   }
 
+  {
+    using decimal_type = boost::decimal::decimal128;
+
+    const auto result_pos128_is_ok = local::test_exp_128(400000);
+
+    BOOST_TEST(result_pos128_is_ok);
+
+    result_is_ok = (result_pos128_is_ok && result_is_ok);
+  }
+
   result_is_ok = ((boost::report_errors() == 0) && result_is_ok);
 
   return (result_is_ok ? 0 : -1);

Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ constexpr auto exp_impl(T x) noexcept`
`76`	`76`	`x -= numbers::ln2_v<T> * nf2;`
`77`	`77`	`}`
`78`	`78`
`79`		`- result = detail::exp_pade_appxroximant(x);`
	`79`	`+ result = detail::exp_pade_appxroximant_or_series(x);`
`80`	`80`
`81`	`81`	`if (nf2 > 0)`
`82`	`82`	`{`