@@ -2498,7 +2498,7 @@ verified for 1 billion random inputs with n=5. [7]
24982498static inline double
24992499vector_norm (Py_ssize_t n , double * vec , double max , int found_nan )
25002500{
2501- double x , h , scale , oldcsum , csum = 1.0 , frac1 = 0.0 , frac2 = 0.0 ;
2501+ double x , h , scale , csum = 1.0 , frac1 = 0.0 , frac2 = 0.0 ;
25022502 DoubleLength pr , sm ;
25032503 int max_e ;
25042504 Py_ssize_t i ;
@@ -2513,49 +2513,42 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
25132513 return max ;
25142514 }
25152515 frexp (max , & max_e );
2516- if (max_e >= -1023 ) {
2517- scale = ldexp (1.0 , - max_e );
2518- assert (max * scale >= 0.5 );
2519- assert (max * scale < 1.0 );
2516+ if (max_e < -1023 ) {
2517+ /* When max_e < -1023, ldexp(1.0, -max_e) would overflow.
2518+ So we first perform lossless scaling from subnormals back to normals,
2519+ then recurse back to vector_norm(), and then finally undo the scaling.
2520+ */
25202521 for (i = 0 ; i < n ; i ++ ) {
2521- x = vec [i ];
2522- assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
2522+ vec [i ] /= DBL_MIN ;
2523+ }
2524+ return DBL_MIN * vector_norm (n , vec , max / DBL_MIN , found_nan );
2525+ }
2526+ scale = ldexp (1.0 , - max_e );
2527+ assert (max * scale >= 0.5 );
2528+ assert (max * scale < 1.0 );
2529+ for (i = 0 ; i < n ; i ++ ) {
2530+ x = vec [i ];
2531+ assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
25232532
2524- x *= scale ;
2525- assert (fabs (x ) < 1.0 );
2533+ x *= scale ;
2534+ assert (fabs (x ) < 1.0 );
25262535
2527- pr = dl_mul (x , x );
2528- assert (pr .hi <= 1.0 );
2536+ pr = dl_mul (x , x );
2537+ assert (pr .hi <= 1.0 );
25292538
2530- sm = dl_fast_sum (csum , pr .hi );
2531- csum = sm .hi ;
2532- frac1 += pr .lo ;
2533- frac2 += sm .lo ;
2534- }
2535- h = sqrt (csum - 1.0 + (frac1 + frac2 ));
2536- pr = dl_mul (- h , h );
25372539 sm = dl_fast_sum (csum , pr .hi );
25382540 csum = sm .hi ;
25392541 frac1 += pr .lo ;
25402542 frac2 += sm .lo ;
2541- x = csum - 1.0 + (frac1 + frac2 );
2542- return (h + x / (2.0 * h )) / scale ;
25432543 }
2544- /* When max_e < -1023, ldexp(1.0, -max_e) overflows.
2545- So instead of multiplying by a scale, we just divide by *max*.
2546- */
2547- for (i = 0 ; i < n ; i ++ ) {
2548- x = vec [i ];
2549- assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
2550- x /= max ;
2551- x = x * x ;
2552- assert (x <= 1.0 );
2553- assert (fabs (csum ) >= fabs (x ));
2554- oldcsum = csum ;
2555- csum += x ;
2556- frac1 += (oldcsum - csum ) + x ;
2557- }
2558- return max * sqrt (csum - 1.0 + frac1 );
2544+ h = sqrt (csum - 1.0 + (frac1 + frac2 ));
2545+ pr = dl_mul (- h , h );
2546+ sm = dl_fast_sum (csum , pr .hi );
2547+ csum = sm .hi ;
2548+ frac1 += pr .lo ;
2549+ frac2 += sm .lo ;
2550+ x = csum - 1.0 + (frac1 + frac2 );
2551+ return (h + x / (2.0 * h )) / scale ;
25592552}
25602553
25612554#define NUM_STACK_ELEMS 16
0 commit comments