Skip to content

Commit aa5b92d

Browse files
committed
Remove the old 512-bit sqrt functions, replace them with the new, optimized ones
1 parent bf12bb8 commit aa5b92d

File tree

2 files changed

+1
-237
lines changed

2 files changed

+1
-237
lines changed

src/utils/512Math.sol

Lines changed: 1 addition & 195 deletions
Original file line numberDiff line numberDiff line change
@@ -1689,164 +1689,7 @@ library Lib512MathArithmetic {
16891689
return omodAlt(r, y, r);
16901690
}
16911691

1692-
// hi ≈ x · y / 2²⁵⁶ (±1)
1693-
function _inaccurateMulHi(uint256 x, uint256 y) private pure returns (uint256 hi) {
1694-
assembly ("memory-safe") {
1695-
hi := sub(mulmod(x, y, not(0x00)), mul(x, y))
1696-
}
1697-
}
1698-
1699-
// gas benchmark 2025/09/20: ~1425 gas
17001692
function _sqrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) {
1701-
/// Our general approach here is to compute the inverse of the square root of the argument
1702-
/// using Newton-Raphson iterations. Then we combine (multiply) this inverse square root
1703-
/// approximation with the argument to approximate the square root of the argument. After
1704-
/// that, a final fixup step is applied to get the exact result. We compute the inverse of
1705-
/// the square root rather than the square root directly because then our Newton-Raphson
1706-
/// iteration can avoid the extremely expensive 512-bit division subroutine.
1707-
unchecked {
1708-
/// First, we normalize `x` by separating it into a mantissa and exponent. We use
1709-
/// even-exponent normalization.
1710-
1711-
// `e` is half the exponent of `x`
1712-
// e = ⌊bitlength(x)/2⌋
1713-
// invE = 256 - e
1714-
uint256 invE = (x_hi.clz() + 1) >> 1; // invE ∈ [0, 128]
1715-
1716-
// Extract mantissa M by shifting x right by 2·e - 255 bits
1717-
// `M` is the mantissa of `x` as a Q1.255; M ∈ [½, 2)
1718-
(, uint256 M) = _shr(x_hi, x_lo, 257 - (invE << 1)); // scale: 2⁽²⁵⁵⁻²ᵉ⁾
1719-
1720-
/// Pick an initial estimate (seed) for Y using a lookup table. Even-exponent
1721-
/// normalization means our mantissa is geometrically symmetric around 1, leading to 16
1722-
/// buckets on the low side and 32 buckets on the high side.
1723-
// `Y` _ultimately_ approximates the inverse square root of fixnum `M` as a
1724-
// Q3.253. However, as a gas optimization, the number of fractional bits in `Y` rises
1725-
// through the steps, giving an inhomogeneous fixed-point representation. Y ≈∈ [√½, √2]
1726-
uint256 Y; // scale: 2⁽²⁵³⁺ᵉ⁾
1727-
uint256 Mbucket;
1728-
assembly ("memory-safe") {
1729-
// Extract the upper 6 bits of `M` to be used as a table index. `M >> 250 < 16` is
1730-
// invalid (that would imply M<½), so our lookup table only needs to handle only 16
1731-
// through 63.
1732-
Mbucket := shr(0xfa, M)
1733-
// We can't fit 48 seeds into a single word, so we split the table in 2 and use `c`
1734-
// to select which table we index.
1735-
let c := lt(0x27, Mbucket)
1736-
1737-
// Each entry is 10 bits and the entries are ordered from lowest `i` to highest. The
1738-
// seed is the value for `Y` for the midpoint of the bucket, rounded to 10
1739-
// significant bits. That is, Y ≈ 1/√(2·M_mid), as a Q247.9. The 2 comes from the
1740-
// half-scale difference between Y and √M. The optimality of this choice was
1741-
// verified by fuzzing.
1742-
let table_hi := 0x71dc26f1b76c9ad6a5a46819c661946418c621856057e5ed775d1715b96b
1743-
let table_lo := 0xb26b4a8690a027198e559263e8ce2887e15832047f1f47b5e677dd974dcd
1744-
let table := xor(table_lo, mul(xor(table_hi, table_lo), c))
1745-
1746-
// Index the table to obtain the initial seed of `Y`.
1747-
let shift := add(0x186, mul(0x0a, sub(mul(0x18, c), Mbucket)))
1748-
// We begin the Newton-Raphson iterations with `Y` in Q247.9 format.
1749-
Y := and(0x3ff, shr(shift, table))
1750-
1751-
// The worst-case seed for `Y` occurs when `Mbucket = 16`. For monotone quadratic
1752-
// convergence, we desire that 1/√3 < Y·√M < √(5/3). At the boundaries (worst case)
1753-
// of the `Mbucket = 16` range, we are 0.407351 (41.3680%) from the lower bound and
1754-
// 0.275987 (27.1906%) from the higher bound.
1755-
}
1756-
1757-
/// Perform 5 Newton-Raphson iterations. 5 is enough iterations for sufficient
1758-
/// convergence that our final fixup step produces an exact result.
1759-
// The Newton-Raphson iteration for 1/√M is:
1760-
// Y ≈ Y · (3 - M · Y²) / 2
1761-
// The implementation of this iteration is deliberately imprecise. No matter how many
1762-
// times you run it, you won't converge `Y` on the closest Q3.253 to √M. However, this
1763-
// is acceptable because the cleanup step applied after the final call is very tolerant
1764-
// of error in the low bits of `Y`.
1765-
1766-
// `M` is Q1.255
1767-
// `Y` is Q247.9
1768-
{
1769-
uint256 Y2 = Y * Y; // scale: 2¹⁸
1770-
// Because `M` is Q1.255, multiplying `Y2` by `M` and taking the high word
1771-
// implicitly divides `MY2` by 2. We move the division by 2 inside the subtraction
1772-
// from 3 by adjusting the minuend.
1773-
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2¹⁸
1774-
uint256 T = 1.5 * 2 ** 18 - MY2; // scale: 2¹⁸
1775-
Y *= T; // scale: 2²⁷
1776-
}
1777-
// `Y` is Q229.27
1778-
{
1779-
uint256 Y2 = Y * Y; // scale: 2⁵⁴
1780-
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2⁵⁴
1781-
uint256 T = 1.5 * 2 ** 54 - MY2; // scale: 2⁵⁴
1782-
Y *= T; // scale: 2⁸¹
1783-
}
1784-
// `Y` is Q175.81
1785-
{
1786-
uint256 Y2 = Y * Y; // scale: 2¹⁶²
1787-
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2¹⁶²
1788-
uint256 T = 1.5 * 2 ** 162 - MY2; // scale: 2¹⁶²
1789-
Y = Y * T >> 116; // scale: 2¹²⁷
1790-
}
1791-
// `Y` is Q129.127
1792-
if (invE < 95 - Mbucket) {
1793-
// Generally speaking, for relatively smaller `e` (lower values of `x`) and for
1794-
// relatively larger `M`, we can skip the 5th N-R iteration. The constant `95` is
1795-
// derived by extensive fuzzing. Attempting a higher-order approximation of the
1796-
// relationship between `M` and `invE` consumes, on average, more gas. When this
1797-
// branch is not taken, the correct bits that this iteration would obtain are
1798-
// shifted away during the denormalization step. This branch is net gas-optimizing.
1799-
uint256 Y2 = Y * Y; // scale: 2²⁵⁴
1800-
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2²⁵⁴
1801-
uint256 T = 1.5 * 2 ** 254 - MY2; // scale: 2²⁵⁴
1802-
Y = _inaccurateMulHi(Y << 2, T); // scale: 2¹²⁷
1803-
}
1804-
// `Y` is Q129.127
1805-
{
1806-
uint256 Y2 = Y * Y; // scale: 2²⁵⁴
1807-
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2²⁵⁴
1808-
uint256 T = 1.5 * 2 ** 254 - MY2; // scale: 2²⁵⁴
1809-
Y = _inaccurateMulHi(Y << 128, T); // scale: 2²⁵³
1810-
}
1811-
// `Y` is Q3.253
1812-
1813-
/// When we combine `Y` with `M` to form our approximation of the square root, we have
1814-
/// to un-normalize by the half-scale value. This is where even-exponent normalization
1815-
/// comes in because the half-scale is integral.
1816-
/// M = ⌊x · 2⁽²⁵⁵⁻²ᵉ⁾⌋
1817-
/// Y ≈ 2²⁵³ / √(M / 2²⁵⁵)
1818-
/// Y ≈ 2³⁸¹ / √(2·M)
1819-
/// M·Y ≈ 2³⁸¹ · √(M/2)
1820-
/// M·Y ≈ 2⁽⁵⁰⁸⁻ᵉ⁾ · √x
1821-
/// r0 ≈ M·Y / 2⁽⁵⁰⁸⁻ᵉ⁾ ≈ ⌊√x⌋
1822-
// We shift right by `508 - e` to account for both the Q3.253 scaling and
1823-
// denormalization. We don't care about accuracy in the low bits of `r0`, so we can cut
1824-
// some corners.
1825-
(, uint256 r0) = _shr(_inaccurateMulHi(M, Y), 0, 252 + invE);
1826-
1827-
/// `r0` is only an approximation of √x, so we perform a single Babylonian step to fully
1828-
/// converge on ⌊√x⌋ or ⌈√x⌉. The Babylonian step is:
1829-
/// r = ⌊(r0 + ⌊x/r0⌋) / 2⌋
1830-
// Rather than use the more-expensive division routine that returns a 512-bit result,
1831-
// because the value the upper word of the quotient can take is highly constrained, we
1832-
// can compute the quotient mod 2²⁵⁶ and recover the high word separately. Although
1833-
// `_div` does an expensive Newton-Raphson-Hensel modular inversion:
1834-
// ⌊x/r0⌋ ≡ ⌊x/2ⁿ⌋·⌊r0/2ⁿ⌋⁻¹ mod 2²⁵⁶ (for r % 2⁽ⁿ⁺¹⁾ = 2ⁿ)
1835-
// and we already have a pretty good estimate for r0⁻¹, namely `Y`, refining `Y` into
1836-
// the appropriate inverse requires a series of 768-bit multiplications that take more
1837-
// gas.
1838-
uint256 q_lo = _div(x_hi, x_lo, r0);
1839-
uint256 q_hi = (r0 <= x_hi).toUint();
1840-
(uint256 s_hi, uint256 s_lo) = _add(q_hi, q_lo, r0);
1841-
// `oflo` here is either 0 or 1. When `oflo == 1`, `r == 0`, and the correct value for
1842-
// `r` is `type(uint256).max`.
1843-
uint256 oflo;
1844-
(oflo, r) = _shr256(s_hi, s_lo, 1);
1845-
r -= oflo; // underflow is desired
1846-
}
1847-
}
1848-
1849-
function _sqrtAlt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) {
18501693
unchecked {
18511694
/// Our general approach is to apply Zimmerman's "Karatsuba Square Root" algorithm
18521695
/// https://inria.hal.science/inria-00072854/document with the helpers from Solady and
@@ -1931,23 +1774,7 @@ library Lib512MathArithmetic {
19311774
return x_lo.sqrt();
19321775
}
19331776

1934-
uint256 r = _sqrt(x_hi, x_lo);
1935-
1936-
// Because the Babylonian step can give ⌈√x⌉ if x+1 is a perfect square, we have to
1937-
// check whether we've overstepped by 1 and clamp as appropriate. ref:
1938-
// https://en.wikipedia.org/wiki/Integer_square_root#Using_only_integer_division
1939-
(uint256 r2_hi, uint256 r2_lo) = _mul(r, r);
1940-
return r.unsafeDec(_gt(r2_hi, r2_lo, x_hi, x_lo));
1941-
}
1942-
1943-
function sqrtAlt(uint512 x) internal pure returns (uint256) {
1944-
(uint256 x_hi, uint256 x_lo) = x.into();
1945-
1946-
if (x_hi == 0) {
1947-
return x_lo.sqrt();
1948-
}
1949-
1950-
return _sqrtAlt(x_hi, x_lo);
1777+
return _sqrt(x_hi, x_lo);
19511778
}
19521779

19531780
function osqrtUp(uint512 r, uint512 x) internal pure returns (uint512) {
@@ -1959,23 +1786,6 @@ library Lib512MathArithmetic {
19591786

19601787
uint256 r_lo = _sqrt(x_hi, x_lo);
19611788

1962-
// The Babylonian step can give ⌈√x⌉ if x+1 is a perfect square. This is
1963-
// fine. If the Babylonian step gave ⌊√x⌋ ≠ √x, we have to round up.
1964-
(uint256 r2_hi, uint256 r2_lo) = _mul(r_lo, r_lo);
1965-
uint256 r_hi;
1966-
(r_hi, r_lo) = _add(0, r_lo, _gt(x_hi, x_lo, r2_hi, r2_lo).toUint());
1967-
return r.from(r_hi, r_lo);
1968-
}
1969-
1970-
function osqrtUpAlt(uint512 r, uint512 x) internal pure returns (uint512) {
1971-
(uint256 x_hi, uint256 x_lo) = x.into();
1972-
1973-
if (x_hi == 0) {
1974-
return r.from(0, x_lo.sqrtUp());
1975-
}
1976-
1977-
uint256 r_lo = _sqrtAlt(x_hi, x_lo);
1978-
19791789
(uint256 r2_hi, uint256 r2_lo) = _mul(r_lo, r_lo);
19801790
uint256 r_hi;
19811791
(r_hi, r_lo) = _add(0, r_lo, _gt(x_hi, x_lo, r2_hi, r2_lo).toUint());
@@ -1986,10 +1796,6 @@ library Lib512MathArithmetic {
19861796
return osqrtUp(r, r);
19871797
}
19881798

1989-
function isqrtUpAlt(uint512 r) internal pure returns (uint512) {
1990-
return osqrtUpAlt(r, r);
1991-
}
1992-
19931799
function oshr(uint512 r, uint512 x, uint256 s) internal pure returns (uint512) {
19941800
(uint256 x_hi, uint256 x_lo) = x.into();
19951801
(uint256 r_hi, uint256 r_lo) = _shr(x_hi, x_lo, s);

test/0.8.25/512Math.t.sol

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -239,26 +239,6 @@ contract Lib512MathTest is Test {
239239
}
240240
}
241241

242-
function test512Math_sqrtAlt(uint256 x_hi, uint256 x_lo) external pure {
243-
uint512 x = alloc().from(x_hi, x_lo);
244-
uint256 r = x.sqrtAlt();
245-
246-
(uint256 r2_lo, uint256 r2_hi) = SlowMath.fullMul(r, r);
247-
assertTrue((r2_hi < x_hi) || (r2_hi == x_hi && r2_lo <= x_lo), "sqrtAlt too high");
248-
249-
if (r == type(uint256).max) {
250-
assertTrue(
251-
x_hi > 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe
252-
|| (x_hi == 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe && x_lo != 0),
253-
"sqrtAlt too low (overflow)"
254-
);
255-
} else {
256-
r++;
257-
(r2_lo, r2_hi) = SlowMath.fullMul(r, r);
258-
assertTrue((r2_hi > x_hi) || (r2_hi == x_hi && r2_lo > x_lo), "sqrtAlt too low");
259-
}
260-
}
261-
262242
function test512Math_divUpAlt(uint256 x_hi, uint256 x_lo, uint256 y_hi, uint256 y_lo) external view {
263243
vm.assume(y_hi != 0);
264244

@@ -442,28 +422,6 @@ contract Lib512MathTest is Test {
442422
}
443423
}
444424

445-
function test512Math_osqrtUpAlt(uint256 x_hi, uint256 x_lo) external pure {
446-
uint512 x = alloc().from(x_hi, x_lo);
447-
(uint256 r_hi, uint256 r_lo) = alloc().osqrtUpAlt(x).into();
448-
449-
if (r_hi == 0 && r_lo == 0) {
450-
assertTrue(x_hi == 0 && x_lo == 0, "sqrtUpAlt of nonzero is zero");
451-
} else if (r_hi != 0) {
452-
assertTrue(r_hi == 1 && r_lo == 0, "overflow result must be exactly 2^256");
453-
(uint256 r_dec2_lo, uint256 r_dec2_hi) = SlowMath.fullMul(type(uint256).max, type(uint256).max);
454-
assertTrue((r_dec2_hi < x_hi) || (r_dec2_hi == x_hi && r_dec2_lo < x_lo), "sqrtUpAlt too high");
455-
} else {
456-
(uint256 r2_lo, uint256 r2_hi) = SlowMath.fullMul(r_lo, r_lo);
457-
assertTrue((r2_hi > x_hi) || (r2_hi == x_hi && r2_lo >= x_lo), "sqrtUpAlt too low");
458-
459-
if (r_lo != 1) {
460-
uint256 r_dec_lo = r_lo - 1;
461-
(r2_lo, r2_hi) = SlowMath.fullMul(r_dec_lo, r_dec_lo);
462-
assertTrue((r2_hi < x_hi) || (r2_hi == x_hi && r2_lo < x_lo), "sqrtUpAlt too high");
463-
}
464-
}
465-
}
466-
467425
function test512Math_oshrUp(uint256 x_hi, uint256 x_lo, uint256 s) external pure {
468426
s = bound(s, 0, 512);
469427

0 commit comments

Comments
 (0)