@@ -101,6 +101,65 @@ inline constexpr Int128 prod_hi<Int128, UInt128>(Int128 x, UInt128 y) {
101101 return static_cast <Int128>(prod - negative_part);
102102}
103103
104+ // Newton-Raphson first order step to improve accuracy of the result.
105+ // For the initial approximation r0 ~ 1/sqrt(x), let
106+ // h = r0^2 * x - 1
107+ // be its scaled error. Then the first-order Newton-Raphson iteration is:
108+ // r1 = r0 - r0 * h / 2
109+ // which has error bounded by:
110+ // |r1 - 1/sqrt(x)| < h^2 / 2.
111+ LIBC_INLINE uint64_t rsqrt_newton_raphson (uint64_t m, uint64_t r) {
112+ uint64_t r2 = prod_hi (r, r);
113+ // h = r0^2*x - 1.
114+ int64_t h = static_cast <int64_t >(prod_hi (m, r2) + r2);
115+ // hr = r * h / 2
116+ int64_t hr = prod_hi (h, static_cast <int64_t >(r >> 1 ));
117+ return r - hr;
118+ }
119+
120+ #ifdef LIBC_MATH_HAS_SMALL_TABLES
121+ // Degree-12 minimax polynomials for 1/sqrt(x) on [1, 2].
122+ constexpr uint32_t RSQRT_COEFFS[12 ] = {
123+ 0xb5947a4a , 0x2d651e32 , 0x9ad50532 , 0x2d28d093 , 0x0d8be653 , 0x04239014 ,
124+ 0x01492449 , 0x0066ff7d , 0x001e74a1 , 0x000984cc , 0x00049abc , 0x00018340 ,
125+ };
126+
127+ LIBC_INLINE uint64_t rsqrt_approx (uint64_t m) {
128+ int64_t x = static_cast <uint64_t >(m) ^ (uint64_t (1 ) << 63 );
129+ int64_t x_26 = x >> 2 ;
130+ int64_t z = x >> 31 ;
131+
132+ if (LIBC_UNLIKELY (z <= -4294967296 ))
133+ return ~(m >> 1 );
134+
135+ uint64_t x2 = static_cast <uint64_t >(z) * static_cast <uint64_t >(z);
136+ uint64_t x2_26 = x2 >> 5 ;
137+ x2 >>= 32 ;
138+ // Calculate the odd part of the polynomial using Horner's method.
139+ uint64_t c0 = RSQRT_COEFFS[8 ] + ((x2 * RSQRT_COEFFS[10 ]) >> 32 );
140+ uint64_t c1 = RSQRT_COEFFS[6 ] + ((x2 * c0) >> 32 );
141+ uint64_t c2 = RSQRT_COEFFS[4 ] + ((x2 * c1) >> 32 );
142+ uint64_t c3 = RSQRT_COEFFS[2 ] + ((x2 * c2) >> 32 );
143+ uint64_t c4 = RSQRT_COEFFS[0 ] + ((x2 * c3) >> 32 );
144+ uint64_t odd =
145+ static_cast <uint64_t >((x >> 34 ) * static_cast <int64_t >(c4 >> 3 )) + x_26;
146+ // Calculate the even part of the polynomial using Horner's method.
147+ uint64_t d0 = RSQRT_COEFFS[9 ] + ((x2 * RSQRT_COEFFS[11 ]) >> 32 );
148+ uint64_t d1 = RSQRT_COEFFS[7 ] + ((x2 * d0) >> 32 );
149+ uint64_t d2 = RSQRT_COEFFS[5 ] + ((x2 * d1) >> 32 );
150+ uint64_t d3 = RSQRT_COEFFS[3 ] + ((x2 * d2) >> 32 );
151+ uint64_t d4 = RSQRT_COEFFS[1 ] + ((x2 * d3) >> 32 );
152+ uint64_t even = 0xd105eb806655d608ul + ((x2 * d4) >> 6 ) + x2_26;
153+
154+ uint64_t r = even - odd; // error < 1.5e-10
155+ // Newton-Raphson first order step to improve accuracy of the result to almost
156+ // 64 bits.
157+ return rsqrt_newton_raphson (m, r);
158+ }
159+
160+ #else
161+ // Cubic minimax polynomials for 1/sqrt(x) on [1 + k/64, 1 + (k + 1)/64]
162+ // for k = 0..63.
104163constexpr uint32_t RSQRT_COEFFS[64 ][4 ] = {
105164 {0xffffffff , 0xfffff780 , 0xbff55815 , 0x9bb5b6e7 },
106165 {0xfc0bd889 , 0xfa1d6e7d , 0xb8a95a89 , 0x938bf8f0 },
@@ -196,26 +255,15 @@ LIBC_INLINE uint64_t rsqrt_approx(uint64_t m) {
196255 uint64_t ro = d * ((c1 + ((d2 * c3) >> 19 )) >> 26 ) >>
197256 6 ; // odd part of the polynomial (negative)
198257 uint64_t r = re - ro; // maximal error < 1.55e-10 and it is less than 2^-32
199-
200258 // Newton-Raphson first order step to improve accuracy of the result to almost
201- // 64 bits:
202- // For the initial approximation r0 ~ 1/sqrt(x), let
203- // h = r0^2 * x - 1
204- // be its scaled error. Then the first-order Newton-Raphson iteration is:
205- // r1 = r0 - r0 * h / 2
206- // which has error bounded by:
207- // |r1 - 1/sqrt(x)| < h^2 / 2.
208- uint64_t r2 = prod_hi (r, r);
209- // h = r0^2*x - 1.
210- int64_t h = static_cast <int64_t >(prod_hi (m, r2) + r2);
211- // hr = r * h / 2
212- int64_t hr = prod_hi (h, static_cast <int64_t >(r >> 1 ));
213- r -= hr;
259+ // 64 bits.
260+ r = rsqrt_newton_raphson (m, r);
214261 // Adjust in the unlucky case x~1;
215262 if (LIBC_UNLIKELY (!r))
216263 --r;
217264 return r;
218265}
266+ #endif // LIBC_MATH_HAS_SMALL_TABLES
219267
220268} // anonymous namespace
221269
@@ -254,7 +302,7 @@ LLVM_LIBC_FUNCTION(float128, sqrtf128, (float128 x)) {
254302 // Now x is subnormal or x = +0.
255303
256304 // x is +0.
257- if (x_u == 0 )
305+ if (x_frac == 0 )
258306 return x;
259307
260308 // Normalize subnormal inputs.
0 commit comments