@@ -1689,164 +1689,7 @@ library Lib512MathArithmetic {
16891689 return omodAlt (r, y, r);
16901690 }
16911691
1692- // hi ≈ x · y / 2²⁵⁶ (±1)
1693- function _inaccurateMulHi (uint256 x , uint256 y ) private pure returns (uint256 hi ) {
1694- assembly ("memory-safe" ) {
1695- hi := sub (mulmod (x, y, not (0x00 )), mul (x, y))
1696- }
1697- }
1698-
1699- // gas benchmark 2025/09/20: ~1425 gas
17001692 function _sqrt (uint256 x_hi , uint256 x_lo ) private pure returns (uint256 r ) {
1701- /// Our general approach here is to compute the inverse of the square root of the argument
1702- /// using Newton-Raphson iterations. Then we combine (multiply) this inverse square root
1703- /// approximation with the argument to approximate the square root of the argument. After
1704- /// that, a final fixup step is applied to get the exact result. We compute the inverse of
1705- /// the square root rather than the square root directly because then our Newton-Raphson
1706- /// iteration can avoid the extremely expensive 512-bit division subroutine.
1707- unchecked {
1708- /// First, we normalize `x` by separating it into a mantissa and exponent. We use
1709- /// even-exponent normalization.
1710-
1711- // `e` is half the exponent of `x`
1712- // e = ⌊bitlength(x)/2⌋
1713- // invE = 256 - e
1714- uint256 invE = (x_hi.clz () + 1 ) >> 1 ; // invE ∈ [0, 128]
1715-
1716- // Extract mantissa M by shifting x right by 2·e - 255 bits
1717- // `M` is the mantissa of `x` as a Q1.255; M ∈ [½, 2)
1718- (, uint256 M ) = _shr (x_hi, x_lo, 257 - (invE << 1 )); // scale: 2⁽²⁵⁵⁻²ᵉ⁾
1719-
1720- /// Pick an initial estimate (seed) for Y using a lookup table. Even-exponent
1721- /// normalization means our mantissa is geometrically symmetric around 1, leading to 16
1722- /// buckets on the low side and 32 buckets on the high side.
1723- // `Y` _ultimately_ approximates the inverse square root of fixnum `M` as a
1724- // Q3.253. However, as a gas optimization, the number of fractional bits in `Y` rises
1725- // through the steps, giving an inhomogeneous fixed-point representation. Y ≈∈ [√½, √2]
1726- uint256 Y; // scale: 2⁽²⁵³⁺ᵉ⁾
1727- uint256 Mbucket;
1728- assembly ("memory-safe" ) {
1729- // Extract the upper 6 bits of `M` to be used as a table index. `M >> 250 < 16` is
1730- // invalid (that would imply M<½), so our lookup table only needs to handle only 16
1731- // through 63.
1732- Mbucket := shr (0xfa , M)
1733- // We can't fit 48 seeds into a single word, so we split the table in 2 and use `c`
1734- // to select which table we index.
1735- let c := lt (0x27 , Mbucket)
1736-
1737- // Each entry is 10 bits and the entries are ordered from lowest `i` to highest. The
1738- // seed is the value for `Y` for the midpoint of the bucket, rounded to 10
1739- // significant bits. That is, Y ≈ 1/√(2·M_mid), as a Q247.9. The 2 comes from the
1740- // half-scale difference between Y and √M. The optimality of this choice was
1741- // verified by fuzzing.
1742- let table_hi := 0x71dc26f1b76c9ad6a5a46819c661946418c621856057e5ed775d1715b96b
1743- let table_lo := 0xb26b4a8690a027198e559263e8ce2887e15832047f1f47b5e677dd974dcd
1744- let table := xor (table_lo, mul (xor (table_hi, table_lo), c))
1745-
1746- // Index the table to obtain the initial seed of `Y`.
1747- let shift := add (0x186 , mul (0x0a , sub (mul (0x18 , c), Mbucket)))
1748- // We begin the Newton-Raphson iterations with `Y` in Q247.9 format.
1749- Y := and (0x3ff , shr (shift, table))
1750-
1751- // The worst-case seed for `Y` occurs when `Mbucket = 16`. For monotone quadratic
1752- // convergence, we desire that 1/√3 < Y·√M < √(5/3). At the boundaries (worst case)
1753- // of the `Mbucket = 16` range, we are 0.407351 (41.3680%) from the lower bound and
1754- // 0.275987 (27.1906%) from the higher bound.
1755- }
1756-
1757- /// Perform 5 Newton-Raphson iterations. 5 is enough iterations for sufficient
1758- /// convergence that our final fixup step produces an exact result.
1759- // The Newton-Raphson iteration for 1/√M is:
1760- // Y ≈ Y · (3 - M · Y²) / 2
1761- // The implementation of this iteration is deliberately imprecise. No matter how many
1762- // times you run it, you won't converge `Y` on the closest Q3.253 to √M. However, this
1763- // is acceptable because the cleanup step applied after the final call is very tolerant
1764- // of error in the low bits of `Y`.
1765-
1766- // `M` is Q1.255
1767- // `Y` is Q247.9
1768- {
1769- uint256 Y2 = Y * Y; // scale: 2¹⁸
1770- // Because `M` is Q1.255, multiplying `Y2` by `M` and taking the high word
1771- // implicitly divides `MY2` by 2. We move the division by 2 inside the subtraction
1772- // from 3 by adjusting the minuend.
1773- uint256 MY2 = _inaccurateMulHi (M, Y2); // scale: 2¹⁸
1774- uint256 T = 1.5 * 2 ** 18 - MY2; // scale: 2¹⁸
1775- Y *= T; // scale: 2²⁷
1776- }
1777- // `Y` is Q229.27
1778- {
1779- uint256 Y2 = Y * Y; // scale: 2⁵⁴
1780- uint256 MY2 = _inaccurateMulHi (M, Y2); // scale: 2⁵⁴
1781- uint256 T = 1.5 * 2 ** 54 - MY2; // scale: 2⁵⁴
1782- Y *= T; // scale: 2⁸¹
1783- }
1784- // `Y` is Q175.81
1785- {
1786- uint256 Y2 = Y * Y; // scale: 2¹⁶²
1787- uint256 MY2 = _inaccurateMulHi (M, Y2); // scale: 2¹⁶²
1788- uint256 T = 1.5 * 2 ** 162 - MY2; // scale: 2¹⁶²
1789- Y = Y * T >> 116 ; // scale: 2¹²⁷
1790- }
1791- // `Y` is Q129.127
1792- if (invE < 95 - Mbucket) {
1793- // Generally speaking, for relatively smaller `e` (lower values of `x`) and for
1794- // relatively larger `M`, we can skip the 5th N-R iteration. The constant `95` is
1795- // derived by extensive fuzzing. Attempting a higher-order approximation of the
1796- // relationship between `M` and `invE` consumes, on average, more gas. When this
1797- // branch is not taken, the correct bits that this iteration would obtain are
1798- // shifted away during the denormalization step. This branch is net gas-optimizing.
1799- uint256 Y2 = Y * Y; // scale: 2²⁵⁴
1800- uint256 MY2 = _inaccurateMulHi (M, Y2); // scale: 2²⁵⁴
1801- uint256 T = 1.5 * 2 ** 254 - MY2; // scale: 2²⁵⁴
1802- Y = _inaccurateMulHi (Y << 2 , T); // scale: 2¹²⁷
1803- }
1804- // `Y` is Q129.127
1805- {
1806- uint256 Y2 = Y * Y; // scale: 2²⁵⁴
1807- uint256 MY2 = _inaccurateMulHi (M, Y2); // scale: 2²⁵⁴
1808- uint256 T = 1.5 * 2 ** 254 - MY2; // scale: 2²⁵⁴
1809- Y = _inaccurateMulHi (Y << 128 , T); // scale: 2²⁵³
1810- }
1811- // `Y` is Q3.253
1812-
1813- /// When we combine `Y` with `M` to form our approximation of the square root, we have
1814- /// to un-normalize by the half-scale value. This is where even-exponent normalization
1815- /// comes in because the half-scale is integral.
1816- /// M = ⌊x · 2⁽²⁵⁵⁻²ᵉ⁾⌋
1817- /// Y ≈ 2²⁵³ / √(M / 2²⁵⁵)
1818- /// Y ≈ 2³⁸¹ / √(2·M)
1819- /// M·Y ≈ 2³⁸¹ · √(M/2)
1820- /// M·Y ≈ 2⁽⁵⁰⁸⁻ᵉ⁾ · √x
1821- /// r0 ≈ M·Y / 2⁽⁵⁰⁸⁻ᵉ⁾ ≈ ⌊√x⌋
1822- // We shift right by `508 - e` to account for both the Q3.253 scaling and
1823- // denormalization. We don't care about accuracy in the low bits of `r0`, so we can cut
1824- // some corners.
1825- (, uint256 r0 ) = _shr (_inaccurateMulHi (M, Y), 0 , 252 + invE);
1826-
1827- /// `r0` is only an approximation of √x, so we perform a single Babylonian step to fully
1828- /// converge on ⌊√x⌋ or ⌈√x⌉. The Babylonian step is:
1829- /// r = ⌊(r0 + ⌊x/r0⌋) / 2⌋
1830- // Rather than use the more-expensive division routine that returns a 512-bit result,
1831- // because the value the upper word of the quotient can take is highly constrained, we
1832- // can compute the quotient mod 2²⁵⁶ and recover the high word separately. Although
1833- // `_div` does an expensive Newton-Raphson-Hensel modular inversion:
1834- // ⌊x/r0⌋ ≡ ⌊x/2ⁿ⌋·⌊r0/2ⁿ⌋⁻¹ mod 2²⁵⁶ (for r % 2⁽ⁿ⁺¹⁾ = 2ⁿ)
1835- // and we already have a pretty good estimate for r0⁻¹, namely `Y`, refining `Y` into
1836- // the appropriate inverse requires a series of 768-bit multiplications that take more
1837- // gas.
1838- uint256 q_lo = _div (x_hi, x_lo, r0);
1839- uint256 q_hi = (r0 <= x_hi).toUint ();
1840- (uint256 s_hi , uint256 s_lo ) = _add (q_hi, q_lo, r0);
1841- // `oflo` here is either 0 or 1. When `oflo == 1`, `r == 0`, and the correct value for
1842- // `r` is `type(uint256).max`.
1843- uint256 oflo;
1844- (oflo, r) = _shr256 (s_hi, s_lo, 1 );
1845- r -= oflo; // underflow is desired
1846- }
1847- }
1848-
1849- function _sqrtAlt (uint256 x_hi , uint256 x_lo ) private pure returns (uint256 r ) {
18501693 unchecked {
18511694 /// Our general approach is to apply Zimmerman's "Karatsuba Square Root" algorithm
18521695 /// https://inria.hal.science/inria-00072854/document with the helpers from Solady and
@@ -1931,23 +1774,7 @@ library Lib512MathArithmetic {
19311774 return x_lo.sqrt ();
19321775 }
19331776
1934- uint256 r = _sqrt (x_hi, x_lo);
1935-
1936- // Because the Babylonian step can give ⌈√x⌉ if x+1 is a perfect square, we have to
1937- // check whether we've overstepped by 1 and clamp as appropriate. ref:
1938- // https://en.wikipedia.org/wiki/Integer_square_root#Using_only_integer_division
1939- (uint256 r2_hi , uint256 r2_lo ) = _mul (r, r);
1940- return r.unsafeDec (_gt (r2_hi, r2_lo, x_hi, x_lo));
1941- }
1942-
1943- function sqrtAlt (uint512 x ) internal pure returns (uint256 ) {
1944- (uint256 x_hi , uint256 x_lo ) = x.into ();
1945-
1946- if (x_hi == 0 ) {
1947- return x_lo.sqrt ();
1948- }
1949-
1950- return _sqrtAlt (x_hi, x_lo);
1777+ return _sqrt (x_hi, x_lo);
19511778 }
19521779
19531780 function osqrtUp (uint512 r , uint512 x ) internal pure returns (uint512 ) {
@@ -1959,23 +1786,6 @@ library Lib512MathArithmetic {
19591786
19601787 uint256 r_lo = _sqrt (x_hi, x_lo);
19611788
1962- // The Babylonian step can give ⌈√x⌉ if x+1 is a perfect square. This is
1963- // fine. If the Babylonian step gave ⌊√x⌋ ≠ √x, we have to round up.
1964- (uint256 r2_hi , uint256 r2_lo ) = _mul (r_lo, r_lo);
1965- uint256 r_hi;
1966- (r_hi, r_lo) = _add (0 , r_lo, _gt (x_hi, x_lo, r2_hi, r2_lo).toUint ());
1967- return r.from (r_hi, r_lo);
1968- }
1969-
1970- function osqrtUpAlt (uint512 r , uint512 x ) internal pure returns (uint512 ) {
1971- (uint256 x_hi , uint256 x_lo ) = x.into ();
1972-
1973- if (x_hi == 0 ) {
1974- return r.from (0 , x_lo.sqrtUp ());
1975- }
1976-
1977- uint256 r_lo = _sqrtAlt (x_hi, x_lo);
1978-
19791789 (uint256 r2_hi , uint256 r2_lo ) = _mul (r_lo, r_lo);
19801790 uint256 r_hi;
19811791 (r_hi, r_lo) = _add (0 , r_lo, _gt (x_hi, x_lo, r2_hi, r2_lo).toUint ());
@@ -1986,10 +1796,6 @@ library Lib512MathArithmetic {
19861796 return osqrtUp (r, r);
19871797 }
19881798
1989- function isqrtUpAlt (uint512 r ) internal pure returns (uint512 ) {
1990- return osqrtUpAlt (r, r);
1991- }
1992-
19931799 function oshr (uint512 r , uint512 x , uint256 s ) internal pure returns (uint512 ) {
19941800 (uint256 x_hi , uint256 x_lo ) = x.into ();
19951801 (uint256 r_hi , uint256 r_lo ) = _shr (x_hi, x_lo, s);
0 commit comments