|
| 1 | +//===-- Implementation header for cbrtf -------------------------*- C++ -*-===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | + |
| 9 | +#ifndef LIBC_SRC___SUPPORT_MATH_CBRTF_H |
| 10 | +#define LIBC_SRC___SUPPORT_MATH_CBRTF_H |
| 11 | + |
| 12 | +#include "src/__support/FPUtil/FEnvImpl.h" |
| 13 | +#include "src/__support/FPUtil/FPBits.h" |
| 14 | +#include "src/__support/FPUtil/multiply_add.h" |
| 15 | +#include "src/__support/macros/config.h" |
| 16 | +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY |
| 17 | + |
| 18 | +namespace LIBC_NAMESPACE_DECL { |
| 19 | + |
| 20 | +namespace math { |
| 21 | + |
| 22 | +LIBC_INLINE static constexpr float cbrtf(float x) { |
| 23 | + // Look up table for 2^(i/3) for i = 0, 1, 2. |
| 24 | + constexpr double CBRT2[3] = {1.0, 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0}; |
| 25 | + |
| 26 | + // Degree-7 polynomials approximation of ((1 + x)^(1/3) - 1)/x for 0 <= x <= 1 |
| 27 | + // generated by Sollya with: |
| 28 | + // > for i from 0 to 15 do { |
| 29 | + // P = fpminimax(((1 + x)^(1/3) - 1)/x, 6, [|D...|], [i/16, (i + 1)/16]); |
| 30 | + // print("{", coeff(P, 0), ",", coeff(P, 1), ",", coeff(P, 2), ",", |
| 31 | + // coeff(P, 3), ",", coeff(P, 4), ",", coeff(P, 5), ",", |
| 32 | + // coeff(P, 6), "},"); |
| 33 | + // }; |
| 34 | + // Then (1 + x)^(1/3) ~ 1 + x * P(x). |
| 35 | + constexpr double COEFFS[16][7] = { |
| 36 | + {0x1.55555555554ebp-2, -0x1.c71c71c678c0cp-4, 0x1.f9add2776de81p-5, |
| 37 | + -0x1.511e10aa964a7p-5, 0x1.ee44165937fa2p-6, -0x1.7c5c9e059345dp-6, |
| 38 | + 0x1.047f75e0aff14p-6}, |
| 39 | + {0x1.5555554d1149ap-2, -0x1.c71c676fcb5bp-4, 0x1.f9ab127dc57ebp-5, |
| 40 | + -0x1.50ea8fd1d4c15p-5, 0x1.e9d68f28ced43p-6, -0x1.60e0e1e661311p-6, |
| 41 | + 0x1.716eca1d6e3bcp-7}, |
| 42 | + {0x1.5555546377d45p-2, -0x1.c71bc1c6d49d2p-4, 0x1.f9924cc0ed24dp-5, |
| 43 | + -0x1.4fea3beb53b3bp-5, 0x1.de028a9a07b1bp-6, -0x1.3b090d2233524p-6, |
| 44 | + 0x1.0aeca34893785p-7}, |
| 45 | + {0x1.55554dce9f649p-2, -0x1.c7188b34b98f8p-4, 0x1.f93e1af34af49p-5, |
| 46 | + -0x1.4d9a06be75c63p-5, 0x1.cb943f4f68992p-6, -0x1.139a685a5e3c4p-6, |
| 47 | + 0x1.88410674c6a5dp-8}, |
| 48 | + {0x1.5555347d211c3p-2, -0x1.c70f2a4b1a5fap-4, 0x1.f88420e8602c3p-5, |
| 49 | + -0x1.49becfa4ed3ep-5, 0x1.b475cd9013162p-6, -0x1.dcfee1dd2f8efp-7, |
| 50 | + 0x1.249bb51a1c498p-8}, |
| 51 | + {0x1.5554f01b33dbap-2, -0x1.c6facb929dbf1p-4, 0x1.f73fb7861252ep-5, |
| 52 | + -0x1.4459a4a0071fap-5, 0x1.9a8df2b504fc2p-6, -0x1.9a7ce3006d06ep-7, |
| 53 | + 0x1.ba9230918fa2ep-9}, |
| 54 | + {0x1.55545c695db5fp-2, -0x1.c6d6089f20275p-4, 0x1.f556e0ea80efp-5, |
| 55 | + -0x1.3d91372d083f4p-5, 0x1.7f66cff331f4p-6, -0x1.606a562491737p-7, |
| 56 | + 0x1.52e3e17c71069p-9}, |
| 57 | + {0x1.55534a879232ap-2, -0x1.c69b836998b84p-4, 0x1.f2bb26dac0e4cp-5, |
| 58 | + -0x1.359eed43716d7p-5, 0x1.64218cd824fbcp-6, -0x1.2e703e2e091e8p-7, |
| 59 | + 0x1.0677d9af6aad4p-9}, |
| 60 | + {0x1.5551836bb5494p-2, -0x1.c64658c15353bp-4, 0x1.ef68517451a6ep-5, |
| 61 | + -0x1.2cc20a980dceep-5, 0x1.49843e0fad93ap-6, -0x1.03c59ccb68e54p-7, |
| 62 | + 0x1.9ad325dc7adcbp-10}, |
| 63 | + {0x1.554ecacb0d035p-2, -0x1.c5d2664026ffcp-4, 0x1.eb624796ba809p-5, |
| 64 | + -0x1.233803d19a535p-5, 0x1.300decb1c3c28p-6, -0x1.befe18031ec3dp-8, |
| 65 | + 0x1.449f5ee175c69p-10}, |
| 66 | + {0x1.554ae1f5ae815p-2, -0x1.c53c6b14ff6b2p-4, 0x1.e6b2d5127bb5bp-5, |
| 67 | + -0x1.19387336788a3p-5, 0x1.180955a6ab255p-6, -0x1.81696703ba369p-8, |
| 68 | + 0x1.02cb36389bd79p-10}, |
| 69 | + {0x1.55458a59f356ep-2, -0x1.c4820dd631ae9p-4, 0x1.e167af818bd15p-5, |
| 70 | + -0x1.0ef35f6f72e52p-5, 0x1.019c33b65e4ebp-6, -0x1.4d25bdd52d3a5p-8, |
| 71 | + 0x1.a008ae91f5936p-11}, |
| 72 | + {0x1.553e878eafee1p-2, -0x1.c3a1d0b2a3db2p-4, 0x1.db90d8ed9f89bp-5, |
| 73 | + -0x1.0490e20f1ae91p-5, 0x1.d9a5d1fc42fe3p-7, -0x1.20bf8227c2abfp-8, |
| 74 | + 0x1.50f8174cdb6e9p-11}, |
| 75 | + {0x1.5535a0dedf1b1p-2, -0x1.c29afb8bd01a1p-4, 0x1.d53f6371c1e27p-5, |
| 76 | + -0x1.f463209b433e2p-6, 0x1.b35222a17e44p-7, -0x1.f5efbf505e133p-9, |
| 77 | + 0x1.12e0e94e8586dp-11}, |
| 78 | + {0x1.552aa25e57bfdp-2, -0x1.c16d811e4acadp-4, 0x1.ce8489b47aa51p-5, |
| 79 | + -0x1.dfde7ff758ea8p-6, 0x1.901f43aac38c8p-7, -0x1.b581d07df5ad5p-9, |
| 80 | + 0x1.c3726535f1fc6p-12}, |
| 81 | + {0x1.551d5d9b204d3p-2, -0x1.c019e328f8db1p-4, 0x1.c7710f44fc3cep-5, |
| 82 | + -0x1.cbbbe25ea8ba4p-6, 0x1.6fe270088623dp-7, -0x1.7e6fc79733761p-9, |
| 83 | + 0x1.75077abf18d84p-12}, |
| 84 | + }; |
| 85 | + |
| 86 | + using FloatBits = typename fputil::FPBits<float>; |
| 87 | + using DoubleBits = typename fputil::FPBits<double>; |
| 88 | + |
| 89 | + FloatBits x_bits(x); |
| 90 | + |
| 91 | + uint32_t x_abs = x_bits.uintval() & 0x7fff'ffff; |
| 92 | + uint32_t sign_bit = (x_bits.uintval() >> 31) << DoubleBits::EXP_LEN; |
| 93 | + |
| 94 | + if (LIBC_UNLIKELY(x == 0.0f || x_abs >= 0x7f80'0000)) { |
| 95 | + // x is 0, Inf, or NaN. |
| 96 | + // Make sure it works for FTZ/DAZ modes. |
| 97 | + return x + x; |
| 98 | + } |
| 99 | + |
| 100 | + double xd = static_cast<double>(x); |
| 101 | + DoubleBits xd_bits(xd); |
| 102 | + |
| 103 | + // When using biased exponent of x in double precision, |
| 104 | + // x_e = real_exponent_of_x + 1023 |
| 105 | + // Then: |
| 106 | + // x_e / 3 = real_exponent_of_x / 3 + 1023/3 |
| 107 | + // = real_exponent_of_x / 3 + 341 |
| 108 | + // So to make it the correct biased exponent of x^(1/3), we add |
| 109 | + // 1023 - 341 = 682 |
| 110 | + // to the quotient x_e / 3. |
| 111 | + unsigned x_e = static_cast<unsigned>(xd_bits.get_biased_exponent()); |
| 112 | + unsigned out_e = (x_e / 3 + 682) | sign_bit; |
| 113 | + unsigned shift_e = x_e % 3; |
| 114 | + |
| 115 | + // Set x_m = 2^(x_e % 3) * (1.mantissa) |
| 116 | + uint64_t x_m = xd_bits.get_mantissa(); |
| 117 | + // Use the leading 4 bits for look up table |
| 118 | + unsigned idx = static_cast<unsigned>(x_m >> (DoubleBits::FRACTION_LEN - 4)); |
| 119 | + |
| 120 | + x_m |= static_cast<uint64_t>(DoubleBits::EXP_BIAS) |
| 121 | + << DoubleBits::FRACTION_LEN; |
| 122 | + |
| 123 | + double x_reduced = DoubleBits(x_m).get_val(); |
| 124 | + double dx = x_reduced - 1.0; |
| 125 | + |
| 126 | + double dx_sq = dx * dx; |
| 127 | + double c0 = fputil::multiply_add(dx, COEFFS[idx][0], 1.0); |
| 128 | + double c1 = fputil::multiply_add(dx, COEFFS[idx][2], COEFFS[idx][1]); |
| 129 | + double c2 = fputil::multiply_add(dx, COEFFS[idx][4], COEFFS[idx][3]); |
| 130 | + double c3 = fputil::multiply_add(dx, COEFFS[idx][6], COEFFS[idx][5]); |
| 131 | + |
| 132 | + double dx_4 = dx_sq * dx_sq; |
| 133 | + double p0 = fputil::multiply_add(dx_sq, c1, c0); |
| 134 | + double p1 = fputil::multiply_add(dx_sq, c3, c2); |
| 135 | + |
| 136 | + double r = fputil::multiply_add(dx_4, p1, p0) * CBRT2[shift_e]; |
| 137 | + |
| 138 | + uint64_t r_m = DoubleBits(r).get_mantissa(); |
| 139 | + // Check if the output is exact. To be exact, the smallest 1-bit of the |
| 140 | + // output has to be at least 2^-7 or higher. So we check the lowest 44 bits |
| 141 | + // to see if they are within 2^(-52 + 3) errors from all zeros, then the |
| 142 | + // result cube root is exact. |
| 143 | + if (LIBC_UNLIKELY(((r_m + 8) & 0xfffffffffff) <= 16)) { |
| 144 | + if ((r_m & 0xfffffffffff) <= 8) |
| 145 | + r_m &= 0xffff'ffff'ffff'ffe0; |
| 146 | + else |
| 147 | + r_m = (r_m & 0xffff'ffff'ffff'ffe0) + 0x20; |
| 148 | + fputil::clear_except_if_required(FE_INEXACT); |
| 149 | + } |
| 150 | + // Adjust exponent and sign. |
| 151 | + uint64_t r_bits = |
| 152 | + r_m | (static_cast<uint64_t>(out_e) << DoubleBits::FRACTION_LEN); |
| 153 | + |
| 154 | + return static_cast<float>(DoubleBits(r_bits).get_val()); |
| 155 | +} |
| 156 | + |
| 157 | +} // namespace math |
| 158 | + |
| 159 | +} // namespace LIBC_NAMESPACE_DECL |
| 160 | + |
| 161 | +#endif // LIBC_SRC___SUPPORT_MATH_CBRTF_H |
0 commit comments