@@ -2455,6 +2455,9 @@ Given that csum >= 1.0, we have:
24552455 lo**2 <= 2**-54 < 2**-53 == 1/2*ulp(1.0) <= ulp(csum)/2
24562456Since lo**2 is less than 1/2 ulp(csum), we have csum+lo*lo == csum.
24572457
2458+ To minimize loss of information during the accumulation of fractional
2459+ values, the lo**2 term has a separate accumulator.
2460+
24582461The square root differential correction is needed because a
24592462correctly rounded square root of a correctly rounded sum of
24602463squares can still be off by as much as one ulp.
@@ -2475,15 +2478,16 @@ Borges' ALGORITHM 4 and 5.
24752478
247624791. Veltkamp-Dekker splitting: http://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf
247724802. Compensated summation: http://www.ti3.tu-harburg.de/paper/rump/Ru08b.pdf
2478- 3. Square root diffential correction: https://arxiv.org/pdf/1904.09481.pdf
2481+ 3. Square root differential correction: https://arxiv.org/pdf/1904.09481.pdf
2482+ 4. https://www.wolframalpha.com/input/?i=Maclaurin+series+sqrt%28h**2+%2B+x%29+at+x%3D0
24792483
24802484*/
24812485
24822486static inline double
24832487vector_norm (Py_ssize_t n , double * vec , double max , int found_nan )
24842488{
24852489 const double T27 = 134217729.0 ; /* ldexp(1.0, 27)+1.0) */
2486- double x , csum = 1.0 , oldcsum , frac = 0.0 , scale ;
2490+ double x , csum = 1.0 , oldcsum , frac = 0.0 , frac_lo = 0.0 , scale ;
24872491 double t , hi , lo , h ;
24882492 int max_e ;
24892493 Py_ssize_t i ;
@@ -2528,8 +2532,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
25282532 frac += (oldcsum - csum ) + x ;
25292533
25302534 assert (csum + lo * lo == csum );
2531- frac += lo * lo ;
2535+ frac_lo += lo * lo ;
25322536 }
2537+ frac += frac_lo ;
25332538 h = sqrt (csum - 1.0 + frac );
25342539
25352540 x = h ;
0 commit comments