Remove the old 512-bit sqrt functions and replace them with the new, optimized ones

duncancmt · duncancmt · commit aa5b92dcc032 · 2026-02-22T16:54:04.000+01:00
diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
@@ -1689,164 +1689,7 @@ library Lib512MathArithmetic {
         return omodAlt(r, y, r);
     }
 
-    // hi ≈ x · y / 2²⁵⁶ (±1)
-    function _inaccurateMulHi(uint256 x, uint256 y) private pure returns (uint256 hi) {
-        assembly ("memory-safe") {
-            hi := sub(mulmod(x, y, not(0x00)), mul(x, y))
-        }
-    }
-
-    // gas benchmark 2025/09/20: ~1425 gas
     function _sqrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) {
-        /// Our general approach here is to compute the inverse of the square root of the argument
-        /// using Newton-Raphson iterations. Then we combine (multiply) this inverse square root
-        /// approximation with the argument to approximate the square root of the argument. After
-        /// that, a final fixup step is applied to get the exact result. We compute the inverse of
-        /// the square root rather than the square root directly because then our Newton-Raphson
-        /// iteration can avoid the extremely expensive 512-bit division subroutine.
-        unchecked {
-            /// First, we normalize `x` by separating it into a mantissa and exponent. We use
-            /// even-exponent normalization.
-
-            // `e` is half the exponent of `x`
-            // e    = ⌊bitlength(x)/2⌋
-            // invE = 256 - e
-            uint256 invE = (x_hi.clz() + 1) >> 1; // invE ∈ [0, 128]
-
-            // Extract mantissa M by shifting x right by 2·e - 255 bits
-            // `M` is the mantissa of `x` as a Q1.255; M ∈ [½, 2)
-            (, uint256 M) = _shr(x_hi, x_lo, 257 - (invE << 1)); // scale: 2⁽²⁵⁵⁻²ᵉ⁾
-
-            /// Pick an initial estimate (seed) for Y using a lookup table. Even-exponent
-            /// normalization means our mantissa is geometrically symmetric around 1, leading to 16
-            /// buckets on the low side and 32 buckets on the high side.
-            // `Y` _ultimately_ approximates the inverse square root of fixnum `M` as a
-            // Q3.253. However, as a gas optimization, the number of fractional bits in `Y` rises
-            // through the steps, giving an inhomogeneous fixed-point representation. Y ≈∈ [√½, √2]
-            uint256 Y; // scale: 2⁽²⁵³⁺ᵉ⁾
-            uint256 Mbucket;
-            assembly ("memory-safe") {
-                // Extract the upper 6 bits of `M` to be used as a table index. `M >> 250 < 16` is
-                // invalid (that would imply M<½), so our lookup table only needs to handle only 16
-                // through 63.
-                Mbucket := shr(0xfa, M)
-                // We can't fit 48 seeds into a single word, so we split the table in 2 and use `c`
-                // to select which table we index.
-                let c := lt(0x27, Mbucket)
-
-                // Each entry is 10 bits and the entries are ordered from lowest `i` to highest. The
-                // seed is the value for `Y` for the midpoint of the bucket, rounded to 10
-                // significant bits. That is, Y ≈ 1/√(2·M_mid), as a Q247.9. The 2 comes from the
-                // half-scale difference between Y and √M. The optimality of this choice was
-                // verified by fuzzing.
-                let table_hi := 0x71dc26f1b76c9ad6a5a46819c661946418c621856057e5ed775d1715b96b
-                let table_lo := 0xb26b4a8690a027198e559263e8ce2887e15832047f1f47b5e677dd974dcd
-                let table := xor(table_lo, mul(xor(table_hi, table_lo), c))
-
-                // Index the table to obtain the initial seed of `Y`.
-                let shift := add(0x186, mul(0x0a, sub(mul(0x18, c), Mbucket)))
-                // We begin the Newton-Raphson iterations with `Y` in Q247.9 format.
-                Y := and(0x3ff, shr(shift, table))
-
-                // The worst-case seed for `Y` occurs when `Mbucket = 16`. For monotone quadratic
-                // convergence, we desire that 1/√3 < Y·√M < √(5/3). At the boundaries (worst case)
-                // of the `Mbucket = 16` range, we are 0.407351 (41.3680%) from the lower bound and
-                // 0.275987 (27.1906%) from the higher bound.
-            }
-
-            /// Perform 5 Newton-Raphson iterations. 5 is enough iterations for sufficient
-            /// convergence that our final fixup step produces an exact result.
-            // The Newton-Raphson iteration for 1/√M is:
-            //     Y ≈ Y · (3 - M · Y²) / 2
-            // The implementation of this iteration is deliberately imprecise. No matter how many
-            // times you run it, you won't converge `Y` on the closest Q3.253 to √M. However, this
-            // is acceptable because the cleanup step applied after the final call is very tolerant
-            // of error in the low bits of `Y`.
-
-            // `M` is Q1.255
-            // `Y` is Q247.9
-            {
-                uint256 Y2 = Y * Y;                    // scale: 2¹⁸
-                // Because `M` is Q1.255, multiplying `Y2` by `M` and taking the high word
-                // implicitly divides `MY2` by 2. We move the division by 2 inside the subtraction
-                // from 3 by adjusting the minuend.
-                uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2¹⁸
-                uint256 T = 1.5 * 2 ** 18 - MY2;       // scale: 2¹⁸
-                Y *= T;                                // scale: 2²⁷
-            }
-            // `Y` is Q229.27
-            {
-                uint256 Y2 = Y * Y;                    // scale: 2⁵⁴
-                uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2⁵⁴
-                uint256 T = 1.5 * 2 ** 54 - MY2;       // scale: 2⁵⁴
-                Y *= T;                                // scale: 2⁸¹
-            }
-            // `Y` is Q175.81
-            {
-                uint256 Y2 = Y * Y;                    // scale: 2¹⁶²
-                uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2¹⁶²
-                uint256 T = 1.5 * 2 ** 162 - MY2;      // scale: 2¹⁶²
-                Y = Y * T >> 116;                      // scale: 2¹²⁷
-            }
-            // `Y` is Q129.127
-            if (invE < 95 - Mbucket) {
-                // Generally speaking, for relatively smaller `e` (lower values of `x`) and for
-                // relatively larger `M`, we can skip the 5th N-R iteration. The constant `95` is
-                // derived by extensive fuzzing. Attempting a higher-order approximation of the
-                // relationship between `M` and `invE` consumes, on average, more gas. When this
-                // branch is not taken, the correct bits that this iteration would obtain are
-                // shifted away during the denormalization step. This branch is net gas-optimizing.
-                uint256 Y2 = Y * Y;                    // scale: 2²⁵⁴
-                uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2²⁵⁴
-                uint256 T = 1.5 * 2 ** 254 - MY2;      // scale: 2²⁵⁴
-                Y = _inaccurateMulHi(Y << 2, T);       // scale: 2¹²⁷
-            }
-            // `Y` is Q129.127
-            {
-                uint256 Y2 = Y * Y;                    // scale: 2²⁵⁴
-                uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2²⁵⁴
-                uint256 T = 1.5 * 2 ** 254 - MY2;      // scale: 2²⁵⁴
-                Y = _inaccurateMulHi(Y << 128, T);     // scale: 2²⁵³
-            }
-            // `Y` is Q3.253
-
-            /// When we combine `Y` with `M` to form our approximation of the square root, we have
-            /// to un-normalize by the half-scale value. This is where even-exponent normalization
-            /// comes in because the half-scale is integral.
-            ///     M   = ⌊x · 2⁽²⁵⁵⁻²ᵉ⁾⌋
-            ///     Y   ≈ 2²⁵³ / √(M / 2²⁵⁵)
-            ///     Y   ≈ 2³⁸¹ / √(2·M)
-            ///     M·Y ≈ 2³⁸¹ · √(M/2)
-            ///     M·Y ≈ 2⁽⁵⁰⁸⁻ᵉ⁾ · √x
-            ///     r0  ≈ M·Y / 2⁽⁵⁰⁸⁻ᵉ⁾ ≈ ⌊√x⌋
-            // We shift right by `508 - e` to account for both the Q3.253 scaling and
-            // denormalization. We don't care about accuracy in the low bits of `r0`, so we can cut
-            // some corners.
-            (, uint256 r0) = _shr(_inaccurateMulHi(M, Y), 0, 252 + invE);
-
-            /// `r0` is only an approximation of √x, so we perform a single Babylonian step to fully
-            /// converge on ⌊√x⌋ or ⌈√x⌉.  The Babylonian step is:
-            ///     r = ⌊(r0 + ⌊x/r0⌋) / 2⌋
-            // Rather than use the more-expensive division routine that returns a 512-bit result,
-            // because the value the upper word of the quotient can take is highly constrained, we
-            // can compute the quotient mod 2²⁵⁶ and recover the high word separately. Although
-            // `_div` does an expensive Newton-Raphson-Hensel modular inversion:
-            //     ⌊x/r0⌋ ≡ ⌊x/2ⁿ⌋·⌊r0/2ⁿ⌋⁻¹ mod 2²⁵⁶ (for r % 2⁽ⁿ⁺¹⁾ = 2ⁿ)
-            // and we already have a pretty good estimate for r0⁻¹, namely `Y`, refining `Y` into
-            // the appropriate inverse requires a series of 768-bit multiplications that take more
-            // gas.
-            uint256 q_lo = _div(x_hi, x_lo, r0);
-            uint256 q_hi = (r0 <= x_hi).toUint();
-            (uint256 s_hi, uint256 s_lo) = _add(q_hi, q_lo, r0);
-            // `oflo` here is either 0 or 1. When `oflo == 1`, `r == 0`, and the correct value for
-            // `r` is `type(uint256).max`.
-            uint256 oflo;
-            (oflo, r) = _shr256(s_hi, s_lo, 1);
-            r -= oflo; // underflow is desired
-        }
-    }
-
-    function _sqrtAlt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) {
         unchecked {
             /// Our general approach is to apply Zimmerman's "Karatsuba Square Root" algorithm
             /// https://inria.hal.science/inria-00072854/document with the helpers from Solady and
@@ -1931,23 +1774,7 @@ library Lib512MathArithmetic {
             return x_lo.sqrt();
         }
 
-        uint256 r = _sqrt(x_hi, x_lo);
-
-        // Because the Babylonian step can give ⌈√x⌉ if x+1 is a perfect square, we have to
-        // check whether we've overstepped by 1 and clamp as appropriate. ref:
-        // https://en.wikipedia.org/wiki/Integer_square_root#Using_only_integer_division
-        (uint256 r2_hi, uint256 r2_lo) = _mul(r, r);
-        return r.unsafeDec(_gt(r2_hi, r2_lo, x_hi, x_lo));
-    }
-
-    function sqrtAlt(uint512 x) internal pure returns (uint256) {
-        (uint256 x_hi, uint256 x_lo) = x.into();
-
-        if (x_hi == 0) {
-            return x_lo.sqrt();
-        }
-
-        return _sqrtAlt(x_hi, x_lo);
+        return _sqrt(x_hi, x_lo);
     }
 
     function osqrtUp(uint512 r, uint512 x) internal pure returns (uint512) {
@@ -1959,23 +1786,6 @@ library Lib512MathArithmetic {
 
         uint256 r_lo = _sqrt(x_hi, x_lo);
 
-        // The Babylonian step can give ⌈√x⌉ if x+1 is a perfect square. This is
-        // fine. If the Babylonian step gave ⌊√x⌋ ≠ √x, we have to round up.
-        (uint256 r2_hi, uint256 r2_lo) = _mul(r_lo, r_lo);
-        uint256 r_hi;
-        (r_hi, r_lo) = _add(0, r_lo, _gt(x_hi, x_lo, r2_hi, r2_lo).toUint());
-        return r.from(r_hi, r_lo);
-    }
-
-    function osqrtUpAlt(uint512 r, uint512 x) internal pure returns (uint512) {
-        (uint256 x_hi, uint256 x_lo) = x.into();
-
-        if (x_hi == 0) {
-            return r.from(0, x_lo.sqrtUp());
-        }
-
-        uint256 r_lo = _sqrtAlt(x_hi, x_lo);
-
         (uint256 r2_hi, uint256 r2_lo) = _mul(r_lo, r_lo);
         uint256 r_hi;
         (r_hi, r_lo) = _add(0, r_lo, _gt(x_hi, x_lo, r2_hi, r2_lo).toUint());
@@ -1986,10 +1796,6 @@ library Lib512MathArithmetic {
         return osqrtUp(r, r);
     }
 
-    function isqrtUpAlt(uint512 r) internal pure returns (uint512) {
-        return osqrtUpAlt(r, r);
-    }
-
     function oshr(uint512 r, uint512 x, uint256 s) internal pure returns (uint512) {
         (uint256 x_hi, uint256 x_lo) = x.into();
         (uint256 r_hi, uint256 r_lo) = _shr(x_hi, x_lo, s);
diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol
@@ -239,26 +239,6 @@ contract Lib512MathTest is Test {
         }
     }
 
-    function test512Math_sqrtAlt(uint256 x_hi, uint256 x_lo) external pure {
-        uint512 x = alloc().from(x_hi, x_lo);
-        uint256 r = x.sqrtAlt();
-
-        (uint256 r2_lo, uint256 r2_hi) = SlowMath.fullMul(r, r);
-        assertTrue((r2_hi < x_hi) || (r2_hi == x_hi && r2_lo <= x_lo), "sqrtAlt too high");
-
-        if (r == type(uint256).max) {
-            assertTrue(
-                x_hi > 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe
-                    || (x_hi == 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe && x_lo != 0),
-                "sqrtAlt too low (overflow)"
-            );
-        } else {
-            r++;
-            (r2_lo, r2_hi) = SlowMath.fullMul(r, r);
-            assertTrue((r2_hi > x_hi) || (r2_hi == x_hi && r2_lo > x_lo), "sqrtAlt too low");
-        }
-    }
-
     function test512Math_divUpAlt(uint256 x_hi, uint256 x_lo, uint256 y_hi, uint256 y_lo) external view {
         vm.assume(y_hi != 0);
 
@@ -442,28 +422,6 @@ contract Lib512MathTest is Test {
         }
     }
 
-    function test512Math_osqrtUpAlt(uint256 x_hi, uint256 x_lo) external pure {
-        uint512 x = alloc().from(x_hi, x_lo);
-        (uint256 r_hi, uint256 r_lo) = alloc().osqrtUpAlt(x).into();
-
-        if (r_hi == 0 && r_lo == 0) {
-            assertTrue(x_hi == 0 && x_lo == 0, "sqrtUpAlt of nonzero is zero");
-        } else if (r_hi != 0) {
-            assertTrue(r_hi == 1 && r_lo == 0, "overflow result must be exactly 2^256");
-            (uint256 r_dec2_lo, uint256 r_dec2_hi) = SlowMath.fullMul(type(uint256).max, type(uint256).max);
-            assertTrue((r_dec2_hi < x_hi) || (r_dec2_hi == x_hi && r_dec2_lo < x_lo), "sqrtUpAlt too high");
-        } else {
-            (uint256 r2_lo, uint256 r2_hi) = SlowMath.fullMul(r_lo, r_lo);
-            assertTrue((r2_hi > x_hi) || (r2_hi == x_hi && r2_lo >= x_lo), "sqrtUpAlt too low");
-
-            if (r_lo != 1) {
-                uint256 r_dec_lo = r_lo - 1;
-                (r2_lo, r2_hi) = SlowMath.fullMul(r_dec_lo, r_dec_lo);
-                assertTrue((r2_hi < x_hi) || (r2_hi == x_hi && r2_lo < x_lo), "sqrtUpAlt too high");
-            }
-        }
-    }
-
     function test512Math_oshrUp(uint256 x_hi, uint256 x_lo, uint256 s) external pure {
         s = bound(s, 0, 512);