@@ -52,15 +52,28 @@ template <int KIND> class Norm2Accumulator {
5252 const Constant<T> &array, const Constant<T> &maxAbs, Rounding rounding)
5353 : array_{array}, maxAbs_{maxAbs}, rounding_{rounding} {};
5454 void operator ()(Scalar<T> &element, const ConstantSubscripts &at) {
55- // Kahan summation of scaled elements
55+ // Kahan summation of scaled elements:
56+ // Naively,
57+ // NORM2(A(:)) = SQRT(SUM(A(:)**2))
58+ // For any T > 0, we have mathematically
59+ // SQRT(SUM(A(:)**2))
60+ // = SQRT(T**2 * (SUM(A(:)**2) / T**2))
61+ // = SQRT(T**2 * SUM(A(:)**2 / T**2))
62+ // = SQRT(T**2 * SUM((A(:)/T)**2))
63+ // = SQRT(T**2) * SQRT(SUM((A(:)/T)**2))
64+ // = T * SQRT(SUM((A(:)/T)**2))
65+ // By letting T = MAXVAL(ABS(A)), we ensure that
66+ // ALL(ABS(A(:)/T) <= 1), so ALL((A(:)/T)**2 <= 1), and the SUM will
67+ // not overflow unless absolutely necessary.
5668 auto scale{maxAbs_.At (maxAbsAt_)};
5769 if (scale.IsZero ()) {
58- // If maxAbs is zero, so are all elements, and result
70+ // Maximum value is zero, and so will the result be.
71+ // Avoid division by zero below.
5972 element = scale;
6073 } else {
6174 auto item{array_.At (at)};
6275 auto scaled{item.Divide (scale).value };
63- auto square{item .Multiply (scaled).value };
76+ auto square{scaled .Multiply (scaled).value };
6477 auto next{square.Add (correction_, rounding_)};
6578 overflow_ |= next.flags .test (RealFlag::Overflow);
6679 auto sum{element.Add (next.value , rounding_)};
@@ -73,13 +86,16 @@ template <int KIND> class Norm2Accumulator {
7386 }
7487 bool overflow () const { return overflow_; }
7588 void Done (Scalar<T> &result) {
89+ // result+correction == SUM((data(:)/maxAbs)**2)
90+ // result = maxAbs * SQRT(result+correction)
7691 auto corrected{result.Add (correction_, rounding_)};
7792 overflow_ |= corrected.flags .test (RealFlag::Overflow);
7893 correction_ = Scalar<T>{};
79- auto rescaled{corrected.value .Multiply (maxAbs_.At (maxAbsAt_))};
94+ auto root{corrected.value .SQRT ().value };
95+ auto product{root.Multiply (maxAbs_.At (maxAbsAt_))};
8096 maxAbs_.IncrementSubscripts (maxAbsAt_);
81- overflow_ |= rescaled .flags .test (RealFlag::Overflow);
82- result = rescaled. value . SQRT () .value ;
97+ overflow_ |= product .flags .test (RealFlag::Overflow);
98+ result = product .value ;
8399 }
84100
85101private:
0 commit comments