@@ -2447,9 +2447,8 @@ Since lo**2 is less than 1/2 ulp(csum), we have csum+lo*lo == csum.
24472447To minimize loss of information during the accumulation of fractional
24482448values, each term has a separate accumulator. This also breaks up
24492449sequential dependencies in the inner loop so the CPU can maximize
2450- floating point throughput. [4] On a 2.6 GHz Haswell, adding one
2451- dimension has an incremental cost of only 5ns -- for example when
2452- moving from hypot(x,y) to hypot(x,y,z).
2450+ floating point throughput. [4] On an Apple M1 Max, hypot(*vec)
2451+ takes only 3.33 µsec when len(vec) == 1000.
24532452
24542453The square root differential correction is needed because a
24552454correctly rounded square root of a correctly rounded sum of
@@ -2473,7 +2472,7 @@ step is exact. The Neumaier summation computes as if in doubled
24732472precision (106 bits) and has the advantage that its input squares
24742473are non-negative so that the condition number of the sum is one.
24752474The square root with a differential correction is likewise computed
2476- as if in double precision.
2475+ as if in doubled precision.
24772476
24782477For n <= 1000, prior to the final addition that rounds the overall
24792478result, the internal accuracy of "h" together with its correction of
@@ -2514,12 +2513,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
25142513 }
25152514 frexp (max , & max_e );
25162515 if (max_e < -1023 ) {
2517- /* When max_e < -1023, ldexp(1.0, -max_e) would overflow.
2518- So we first perform lossless scaling from subnormals back to normals,
2519- then recurse back to vector_norm(), and then finally undo the scaling.
2520- */
2516+ /* When max_e < -1023, ldexp(1.0, -max_e) would overflow. */
25212517 for (i = 0 ; i < n ; i ++ ) {
2522- vec [i ] /= DBL_MIN ;
2518+ vec [i ] /= DBL_MIN ; // convert subnormals to normals
25232519 }
25242520 return DBL_MIN * vector_norm (n , vec , max / DBL_MIN , found_nan );
25252521 }
@@ -2529,17 +2525,14 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
25292525 for (i = 0 ; i < n ; i ++ ) {
25302526 x = vec [i ];
25312527 assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
2532-
2533- x *= scale ;
2528+ x *= scale ; // lossless scaling
25342529 assert (fabs (x ) < 1.0 );
2535-
2536- pr = dl_mul (x , x );
2530+ pr = dl_mul (x , x ); // lossless squaring
25372531 assert (pr .hi <= 1.0 );
2538-
2539- sm = dl_fast_sum (csum , pr .hi );
2532+ sm = dl_fast_sum (csum , pr .hi ); // lossless addition
25402533 csum = sm .hi ;
2541- frac1 += pr .lo ;
2542- frac2 += sm .lo ;
2534+ frac1 += pr .lo ; // lossy addition
2535+ frac2 += sm .lo ; // lossy addition
25432536 }
25442537 h = sqrt (csum - 1.0 + (frac1 + frac2 ));
25452538 pr = dl_mul (- h , h );
@@ -2548,7 +2541,8 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
25482541 frac1 += pr .lo ;
25492542 frac2 += sm .lo ;
25502543 x = csum - 1.0 + (frac1 + frac2 );
2551- return (h + x / (2.0 * h )) / scale ;
2544+ h += x / (2.0 * h ); // differential correction
2545+ return h / scale ;
25522546}
25532547
25542548#define NUM_STACK_ELEMS 16
0 commit comments