From 8297d2edd1afb970dbe620c89df078aa0286e283 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 21:17:47 +0100 Subject: [PATCH 01/38] WIP: 512Math cbrt --- src/vendor/Cbrt.sol | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 src/vendor/Cbrt.sol diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol new file mode 100644 index 000000000..cb0f80e0f --- /dev/null +++ b/src/vendor/Cbrt.sol @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +pragma solidity ^0.8.25; + +// @author Modified from Solady by Vectorized and Akshay Tarpara https://github.com/Vectorized/solady/blob/ff6256a18851749e765355b3e21dc9bfa417255b/src/utils/clz/FixedPointMathLib.sol#L799-L822 under the MIT license. +library Cbrt { + /// @dev Returns the cube root of `x`, rounded down. + /// Credit to bout3fiddy and pcaversaccio under AGPLv3 license: + /// https://github.com/pcaversaccio/snekmate/blob/main/src/snekmate/utils/math.vy + /// Formally verified by xuwinnie: + /// https://github.com/vectorized/solady/blob/main/audits/xuwinnie-solady-cbrt-proof.pdf + function cbrt(uint256 x) internal pure returns (uint256 z) { + /// @solidity memory-safe-assembly + assembly ("memory-safe") { + // Initial guess z = 2^ceil((log2(x) + 2) / 3). + // Since log2(x) = 255 - clz(x), the expression shl((257 - clz(x)) / 3, 1) + // computes this over-estimate. Guaranteed ≥ cbrt(x) and safe for Newton-Raphson's. + z := shl(div(sub(257, clz(x)), 3), 1) + // Newton-Raphson's. + z := div(add(add(div(x, mul(z, z)), z), z), 3) + z := div(add(add(div(x, mul(z, z)), z), z), 3) + z := div(add(add(div(x, mul(z, z)), z), z), 3) + z := div(add(add(div(x, mul(z, z)), z), z), 3) + z := div(add(add(div(x, mul(z, z)), z), z), 3) + z := div(add(add(div(x, mul(z, z)), z), z), 3) + z := div(add(add(div(x, mul(z, z)), z), z), 3) + // Round down. + z := sub(z, lt(div(x, mul(z, z)), z)) + } + } +} From 2370b466422eab80462cda8cc306410a4259b422 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 21:31:10 +0100 Subject: [PATCH 02/38] WIP: 512Math cbrt --- test/0.8.25/512Math.t.sol | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol index d202c4518..af23c317a 100644 --- a/test/0.8.25/512Math.t.sol +++ b/test/0.8.25/512Math.t.sol @@ -422,6 +422,21 @@ contract Lib512MathTest is Test { } } + function test512Math_cbrt(uint256 x_hi, uint256 x_lo) external pure { + uint512 x = alloc().from(x_hi, x_lo); + uint256 r = x.cbrt(); + uint512 r3 = alloc().omul(r, r).imul(r); + + assertTrue(r3 <= x, "cbrt too high"); + if (!(x_hi > 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f + || (x_hi == 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f + && x_lo > 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d79ff))) { + r++; + r3.omul(r, r).imul(r); + assertTrue(r3 > x, "cbrt too low"); + } + } + function test512Math_oshrUp(uint256 x_hi, uint256 x_lo, uint256 s) external pure { s = bound(s, 0, 512); From f045d800f5178d3188316cb541ca4d235f14723b Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 21:33:46 +0100 Subject: [PATCH 03/38] WIP: 512Math cbrt --- src/utils/512Math.sol | 77 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index c63961635..b86b68deb 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -7,6 +7,7 @@ import {Clz} from "../vendor/Clz.sol"; import {Ternary} from "./Ternary.sol"; import {FastLogic} from "./FastLogic.sol"; import {Sqrt} from "../vendor/Sqrt.sol"; +import {Cbrt} from "../vendor/Cbrt.sol"; /* @@ -180,6 +181,10 @@ WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING /// * osqrtUp(uint512,uint512) /// * isqrtUp(uint512) /// +/// ### Cube root +/// +/// * cbrt(uint512) returns (uint256) +/// /// ### Shifting /// /// * oshr(uint512,uint512,uint256) @@ -358,6 +363,7 @@ library Lib512MathArithmetic { using Ternary for bool; using FastLogic for bool; using Sqrt for uint256; + using Cbrt for uint256; function _add(uint256 x, uint256 y) private pure returns (uint256 r_hi, uint256 r_lo) { assembly ("memory-safe") { @@ -1809,6 +1815,77 @@ library Lib512MathArithmetic { return osqrtUp(r, r); } + function _cubeGt(uint256 r, uint256 x_hi, uint256 x_lo) private pure returns (bool isGreater) { + (uint256 r2_hi, uint256 r2_lo) = _mul(r, r); + assembly ("memory-safe") { + let mm := mulmod(r2_lo, r, not(0x00)) + let lo := mul(r2_lo, r) + let hi_lo := sub(sub(mm, lo), lt(mm, lo)) + + mm := mulmod(r2_hi, r, not(0x00)) + let cross_lo := mul(r2_hi, r) + let cross_hi := sub(sub(mm, cross_lo), lt(mm, cross_lo)) + + let hi := add(hi_lo, cross_lo) + let ext := add(cross_hi, lt(hi, hi_lo)) + + isGreater := or(ext, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo)))) + } + } + + function _cbrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) { + unchecked { + // Normalize `x` by a multiple of 3 so `x_hi >> 2` is a well-conditioned input + // for the top-limb cube root extraction. + uint256 shift = x_hi.clz(); + (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift - shift % 3); + shift /= 3; + + // Split x into base B = 2^86 "limbs": + // x = x3 * B^3 + x2 * B^2 + x1 * B + x0 + // where x3 is 254 bits and x2/x1/x0 are 86-bit limbs. + uint256 x3 = x_hi >> 2; + uint256 x2; + assembly ("memory-safe") { + x2 := or(shl(84, and(x_hi, 0x03)), shr(172, x_lo)) + } + + // First limb from 256-bit cbrt. + uint256 r_hi = x3.cbrt(); + uint256 r_hi_sq = r_hi * r_hi; + uint256 res = x3 - r_hi_sq * r_hi; + uint256 d = 3 * r_hi_sq; + + // Recover low 86-bit limb from: + // floor((res * B + x2) / (3 * r_hi^2)). + uint256 n_hi = res >> 170; + uint256 n_lo = (res << 86) | x2; + uint256 r_lo = n_hi == 0 ? n_lo / d : _div(n_hi, n_lo, d); + + // Second-order correction for the omitted quadratic term in the first-limb lift. + // With B = 2^86, q := r_lo, and a := r_hi: + // q' = q - floor(q^2 / (a * B)). + // This removes almost all of the overshoot from the first-order estimate. + r_lo -= (r_lo * r_lo) / (r_hi << 86); + r = (r_hi << 86) + r_lo; + + // Exact floor correction. + r = r.unsafeDec(_cubeGt(r, x_hi, x_lo)); + + return r >> shift; + } + } + + function cbrt(uint512 x) internal pure returns (uint256) { + (uint256 x_hi, uint256 x_lo) = x.into(); + + if (x_hi == 0) { + return x_lo.cbrt(); + } + + return _cbrt(x_hi, x_lo); + } + function oshr(uint512 r, uint512 x, uint256 s) internal pure returns (uint512) { (uint256 x_hi, uint256 x_lo) = x.into(); (uint256 r_hi, uint256 r_lo) = _shr(x_hi, x_lo, s); From 8a7905be7f4183d4b1b6db7a5ac74ab27fdea022 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 21:44:42 +0100 Subject: [PATCH 04/38] Ignore spam solc error codes --- foundry.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/foundry.toml b/foundry.toml index 4cff5a01d..d86543996 100644 --- a/foundry.toml +++ b/foundry.toml @@ -1,5 +1,6 @@ [profile.default] unchecked_cheatcode_artifacts = true +ignored_error_codes = [2424, 4591] # https://github.com/foundry-rs/foundry/issues/6780#issuecomment-1962319449 bytecode_hash = "none" From 4ef89a66314d2c8bf48dc21c4e0c16e504d2539f Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 21:45:07 +0100 Subject: [PATCH 05/38] Cleanup --- src/utils/512Math.sol | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index b86b68deb..1f6863339 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1696,12 +1696,10 @@ library Lib512MathArithmetic { } function _sqrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) { + /// Our general approach is to apply Zimmerman's "Karatsuba Square Root" algorithm + /// https://inria.hal.science/inria-00072854/document with the helpers from Solady and + /// 512Math. This approach is inspired by https://github.com/SimonSuckut/Solidity_Uint512/ unchecked { - /// Our general approach is to apply Zimmerman's "Karatsuba Square Root" algorithm - /// https://inria.hal.science/inria-00072854/document with the helpers from Solady and - /// 512Math. This approach is inspired by - /// https://github.com/SimonSuckut/Solidity_Uint512/ - // Normalize `x` so the top word has its MSB in bit 255 or 254. // x ≥ 2⁵¹⁰ uint256 shift = x_hi.clz(); @@ -1815,25 +1813,27 @@ library Lib512MathArithmetic { return osqrtUp(r, r); } - function _cubeGt(uint256 r, uint256 x_hi, uint256 x_lo) private pure returns (bool isGreater) { - (uint256 r2_hi, uint256 r2_lo) = _mul(r, r); + function _cubeGt(uint256 a, uint256 b_hi, uint256 b_lo) private pure returns (bool r) { + (uint256 a2_hi, uint256 a2_lo) = _mul(a, a); assembly ("memory-safe") { - let mm := mulmod(r2_lo, r, not(0x00)) - let lo := mul(r2_lo, r) + let mm := mulmod(a2_lo, a, not(0x00)) + let lo := mul(a2_lo, a) let hi_lo := sub(sub(mm, lo), lt(mm, lo)) - mm := mulmod(r2_hi, r, not(0x00)) - let cross_lo := mul(r2_hi, r) + mm := mulmod(a2_hi, a, not(0x00)) + let cross_lo := mul(a2_hi, a) let cross_hi := sub(sub(mm, cross_lo), lt(mm, cross_lo)) let hi := add(hi_lo, cross_lo) let ext := add(cross_hi, lt(hi, hi_lo)) - isGreater := or(ext, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo)))) + r := or(ext, or(gt(hi, b_hi), and(eq(hi, b_hi), gt(lo, b_lo)))) } } function _cbrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) { + /// This is the same general technique as we applied in `_sqrt`, patterned after Zimmerman's + /// "Karatsuba Square Root" algorithm, but adapted to compute cube roots instead. unchecked { // Normalize `x` by a multiple of 3 so `x_hi >> 2` is a well-conditioned input // for the top-limb cube root extraction. @@ -1860,13 +1860,13 @@ library Lib512MathArithmetic { // floor((res * B + x2) / (3 * r_hi^2)). uint256 n_hi = res >> 170; uint256 n_lo = (res << 86) | x2; - uint256 r_lo = n_hi == 0 ? n_lo / d : _div(n_hi, n_lo, d); + uint256 r_lo = n_hi == 0 ? n_lo.unsafeDiv(d) : _div(n_hi, n_lo, d); // Second-order correction for the omitted quadratic term in the first-limb lift. // With B = 2^86, q := r_lo, and a := r_hi: // q' = q - floor(q^2 / (a * B)). // This removes almost all of the overshoot from the first-order estimate. - r_lo -= (r_lo * r_lo) / (r_hi << 86); + r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86); r = (r_hi << 86) + r_lo; // Exact floor correction. From cd876e1b478db1d1f6622ec312a5f21073cd91e6 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 21:52:49 +0100 Subject: [PATCH 06/38] Golf --- src/utils/512Math.sol | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 1f6863339..0fa300514 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1858,9 +1858,23 @@ library Lib512MathArithmetic { // Recover low 86-bit limb from: // floor((res * B + x2) / (3 * r_hi^2)). - uint256 n_hi = res >> 170; - uint256 n_lo = (res << 86) | x2; - uint256 r_lo = n_hi == 0 ? n_lo.unsafeDiv(d) : _div(n_hi, n_lo, d); + // `res * B + x2` is up to 257 bits. Here `c := res >> 170` is a single + // carry bit (`res < 2^171` for this normalized range), so we can avoid + // 512-bit division by folding that carry manually, exactly as in `_sqrt`. + uint256 r_lo; + assembly ("memory-safe") { + let n := or(shl(86, res), x2) + r_lo := div(n, d) + + let c := shr(170, res) + if c { + let neg_c := not(0x00) + let rem := mod(n, d) + r_lo := add(r_lo, div(neg_c, d)) + rem := add(rem, add(0x01, mod(neg_c, d))) + r_lo := add(r_lo, div(rem, d)) + } + } // Second-order correction for the omitted quadratic term in the first-limb lift. // With B = 2^86, q := r_lo, and a := r_hi: From 669696cbe8090f377898f0ac33321a39cf4959d3 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 21:59:43 +0100 Subject: [PATCH 07/38] Golf --- src/utils/512Math.sol | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 0fa300514..c870b8cf4 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1851,7 +1851,21 @@ library Lib512MathArithmetic { } // First limb from 256-bit cbrt. - uint256 r_hi = x3.cbrt(); + // Here x3 is always in [2^251, 2^254), so the Solady seed + // 2^ceil((log2(x3) + 2) / 3) is exactly either 2^84 or 2^85. + uint256 r_hi; + assembly ("memory-safe") { + r_hi := shl(shr(253, x3), shl(84, 1)) + + r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) + r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) + r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) + r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) + r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) + r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) + r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) + r_hi := sub(r_hi, lt(div(x3, mul(r_hi, r_hi)), r_hi)) + } uint256 r_hi_sq = r_hi * r_hi; uint256 res = x3 - r_hi_sq * r_hi; uint256 d = 3 * r_hi_sq; From aa9499f6648ce3954b56ef0fa5f50128a9da3ea7 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 22:01:42 +0100 Subject: [PATCH 08/38] Golf --- src/utils/512Math.sol | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index c870b8cf4..862e8819d 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1855,7 +1855,7 @@ library Lib512MathArithmetic { // 2^ceil((log2(x3) + 2) / 3) is exactly either 2^84 or 2^85. uint256 r_hi; assembly ("memory-safe") { - r_hi := shl(shr(253, x3), shl(84, 1)) + r_hi := 0x1000000000000000000000 r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) From 00660e3c154ab9060920b3f13500f0047805f7c1 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 22:37:04 +0100 Subject: [PATCH 09/38] Golf --- src/utils/512Math.sol | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 862e8819d..c180a09fd 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1813,24 +1813,6 @@ library Lib512MathArithmetic { return osqrtUp(r, r); } - function _cubeGt(uint256 a, uint256 b_hi, uint256 b_lo) private pure returns (bool r) { - (uint256 a2_hi, uint256 a2_lo) = _mul(a, a); - assembly ("memory-safe") { - let mm := mulmod(a2_lo, a, not(0x00)) - let lo := mul(a2_lo, a) - let hi_lo := sub(sub(mm, lo), lt(mm, lo)) - - mm := mulmod(a2_hi, a, not(0x00)) - let cross_lo := mul(a2_hi, a) - let cross_hi := sub(sub(mm, cross_lo), lt(mm, cross_lo)) - - let hi := add(hi_lo, cross_lo) - let ext := add(cross_hi, lt(hi, hi_lo)) - - r := or(ext, or(gt(hi, b_hi), and(eq(hi, b_hi), gt(lo, b_lo)))) - } - } - function _cbrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) { /// This is the same general technique as we applied in `_sqrt`, patterned after Zimmerman's /// "Karatsuba Square Root" algorithm, but adapted to compute cube roots instead. @@ -1898,7 +1880,20 @@ library Lib512MathArithmetic { r = (r_hi << 86) + r_lo; // Exact floor correction. - r = r.unsafeDec(_cubeGt(r, x_hi, x_lo)); + assembly ("memory-safe") { + let mm0 := mulmod(r, r, not(0x00)) + let r2_lo := mul(r, r) + let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo)) + + let mm1 := mulmod(r2_lo, r, not(0x00)) + let lo := mul(r2_lo, r) + let hi_lo := sub(sub(mm1, lo), lt(mm1, lo)) + + let cross_lo := mul(r2_hi, r) + let hi := add(hi_lo, cross_lo) + + r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo)))) + } return r >> shift; } From e783514e0ba1f2a3167aa8b2f37cfda714c3e277 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 22:45:57 +0100 Subject: [PATCH 10/38] Golf --- src/utils/512Math.sol | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index c180a09fd..78055aba2 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1819,9 +1819,8 @@ library Lib512MathArithmetic { unchecked { // Normalize `x` by a multiple of 3 so `x_hi >> 2` is a well-conditioned input // for the top-limb cube root extraction. - uint256 shift = x_hi.clz(); - (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift - shift % 3); - shift /= 3; + uint256 shift = x_hi.clz() / 3; + (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift * 3); // Split x into base B = 2^86 "limbs": // x = x3 * B^3 + x2 * B^2 + x1 * B + x0 From 8cd4b21adcc09d55ec80843fd5f01461a04e0711 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Mon, 23 Feb 2026 22:58:54 +0100 Subject: [PATCH 11/38] Cleanup --- src/vendor/Cbrt.sol | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol index cb0f80e0f..3604c919c 100644 --- a/src/vendor/Cbrt.sol +++ b/src/vendor/Cbrt.sol @@ -9,7 +9,6 @@ library Cbrt { /// Formally verified by xuwinnie: /// https://github.com/vectorized/solady/blob/main/audits/xuwinnie-solady-cbrt-proof.pdf function cbrt(uint256 x) internal pure returns (uint256 z) { - /// @solidity memory-safe-assembly assembly ("memory-safe") { // Initial guess z = 2^ceil((log2(x) + 2) / 3). // Since log2(x) = 255 - clz(x), the expression shl((257 - clz(x)) / 3, 1) From b6445ee69faaa8aa9c4a05e213b78be340743adb Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 08:54:20 +0100 Subject: [PATCH 12/38] Golf --- src/utils/512Math.sol | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 78055aba2..094e229c0 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1835,6 +1835,8 @@ library Lib512MathArithmetic { // Here x3 is always in [2^251, 2^254), so the Solady seed // 2^ceil((log2(x3) + 2) / 3) is exactly either 2^84 or 2^85. uint256 r_hi; + uint256 res; + uint256 d; assembly ("memory-safe") { r_hi := 0x1000000000000000000000 @@ -1844,12 +1846,26 @@ library Lib512MathArithmetic { r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) - r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) - r_hi := sub(r_hi, lt(div(x3, mul(r_hi, r_hi)), r_hi)) + + let r_hi_sq := mul(r_hi, r_hi) + let r_hi_cube := mul(r_hi_sq, r_hi) + if gt(r_hi_cube, x3) { + // We gate the 7th Newton-Raphson step on whether it has overestimated. The + // second-order correction below is sufficient to correct for small + // underestimation. This branch is net gas-optimizing. + r_hi := div(add(add(div(x3, r_hi_sq), r_hi), r_hi), 3) + r_hi_sq := mul(r_hi, r_hi) + r_hi_cube := mul(r_hi_sq, r_hi) + if gt(r_hi_cube, x3) { + r_hi := sub(r_hi, 0x01) + r_hi_sq := mul(r_hi, r_hi) + r_hi_cube := mul(r_hi_sq, r_hi) + } + } + + res := sub(x3, r_hi_cube) + d := mul(3, r_hi_sq) } - uint256 r_hi_sq = r_hi * r_hi; - uint256 res = x3 - r_hi_sq * r_hi; - uint256 d = 3 * r_hi_sq; // Recover low 86-bit limb from: // floor((res * B + x2) / (3 * r_hi^2)). From a27a7e0b7a207ea5781168a19fa7caa8573f8a6b Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 09:31:04 +0100 Subject: [PATCH 13/38] Cleanup --- src/utils/512Math.sol | 80 +++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 094e229c0..d0f35ec02 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1700,7 +1700,8 @@ library Lib512MathArithmetic { /// https://inria.hal.science/inria-00072854/document with the helpers from Solady and /// 512Math. This approach is inspired by https://github.com/SimonSuckut/Solidity_Uint512/ unchecked { - // Normalize `x` so the top word has its MSB in bit 255 or 254. + // Normalize `x` so the top word has its MSB in bit 255 or 254. This makes the "shift + // back" step exact. // x ≥ 2⁵¹⁰ uint256 shift = x_hi.clz(); (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift & 0xfe); @@ -1817,67 +1818,69 @@ library Lib512MathArithmetic { /// This is the same general technique as we applied in `_sqrt`, patterned after Zimmerman's /// "Karatsuba Square Root" algorithm, but adapted to compute cube roots instead. unchecked { - // Normalize `x` by a multiple of 3 so `x_hi >> 2` is a well-conditioned input - // for the top-limb cube root extraction. + // Normalize `x` so that its MSB is in bit 255, 254, or 253. This makes the left shift a + // multiple of 3 so that the "shift back" un-normalization step is exact. + // x ≥ 2⁵⁰⁹ uint256 shift = x_hi.clz() / 3; (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift * 3); - // Split x into base B = 2^86 "limbs": - // x = x3 * B^3 + x2 * B^2 + x1 * B + x0 - // where x3 is 254 bits and x2/x1/x0 are 86-bit limbs. - uint256 x3 = x_hi >> 2; - uint256 x2; + // Zimmerman's "Karatsuba Square Root" algorithm works with limbs of `r` that are half + // of a word. For cube root, we use limbs of `r` that are one third of a word. The + // initial step to compute the first "limb" of `r` uses the "normal" cube root algorithm + // and consumes the first (almost) word of `x`. The second and final limb of `r` is + // computed using an analogue of the Karatsuba step from the original algorithm, + // followed by a pair of cleanup steps. `limb_hi` is the next 86-bit limb of `x` after + // the first whole-ish word. + uint256 limb_hi; assembly ("memory-safe") { - x2 := or(shl(84, and(x_hi, 0x03)), shr(172, x_lo)) + limb_hi := or(shl(0x54, and(x_hi, 0x03)), shr(0xac, x_lo)) } - // First limb from 256-bit cbrt. - // Here x3 is always in [2^251, 2^254), so the Solady seed - // 2^ceil((log2(x3) + 2) / 3) is exactly either 2^84 or 2^85. + // Now we run the "normal" cube root algorithm to obtain the first limb of `r`, which we + // store in `r_hi`. `res` is the residue after this first operation and `d` is the + // derivative/denominator for the subsequent outer Newton step of the Karatsuba "loop". uint256 r_hi; uint256 res; uint256 d; assembly ("memory-safe") { - r_hi := 0x1000000000000000000000 + let w := shr(2, x_hi) // w ≥ 2²⁵¹; w < 2²⁵⁴ from the above normalization + r_hi := 0x1000000000000000000000 // Given `w` in its range, this seed is suitable - r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) - r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) - r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) - r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) - r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) - r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3) + r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03) + r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03) + r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03) + r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03) + r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03) + r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03) let r_hi_sq := mul(r_hi, r_hi) let r_hi_cube := mul(r_hi_sq, r_hi) - if gt(r_hi_cube, x3) { - // We gate the 7th Newton-Raphson step on whether it has overestimated. The - // second-order correction below is sufficient to correct for small - // underestimation. This branch is net gas-optimizing. - r_hi := div(add(add(div(x3, r_hi_sq), r_hi), r_hi), 3) + if gt(r_hi_cube, w) { + // We gate the 7th Newton-Raphson step and ceil/floor cleanup on whether it has + // overestimated. The second-order correction further below is sufficient to + // correct for small underestimation. This branch is net gas-optimizing. + r_hi := div(add(add(div(w, r_hi_sq), r_hi), r_hi), 0x03) + r_hi := sub(r_hi, lt(div(w, mul(r_hi, r_hi)), r_hi)) r_hi_sq := mul(r_hi, r_hi) r_hi_cube := mul(r_hi_sq, r_hi) - if gt(r_hi_cube, x3) { - r_hi := sub(r_hi, 0x01) - r_hi_sq := mul(r_hi, r_hi) - r_hi_cube := mul(r_hi_sq, r_hi) - } } - res := sub(x3, r_hi_cube) - d := mul(3, r_hi_sq) + res := sub(w, r_hi_cube) + d := mul(0x03, r_hi_sq) } - // Recover low 86-bit limb from: - // floor((res * B + x2) / (3 * r_hi^2)). - // `res * B + x2` is up to 257 bits. Here `c := res >> 170` is a single - // carry bit (`res < 2^171` for this normalized range), so we can avoid - // 512-bit division by folding that carry manually, exactly as in `_sqrt`. + // This is the Karatsuba step. The 86-bit lower limb of `r` is (almost): + // r_lo = ⌊(res ⋅ 2⁸⁶ + limb_hi) / (3 ⋅ r_hi²)⌋ + // res = (res ⋅ 2⁸⁶ + limb_hi) % (3 ⋅ r_hi²) + // Where `res` is the (nearly) 2-limb residue from the previous "normal" cube root step. uint256 r_lo; assembly ("memory-safe") { - let n := or(shl(86, res), x2) + let n := or(shl(0x56, res), limb_hi) r_lo := div(n, d) - let c := shr(170, res) + let c := shr(0xaa, res) + // If `res` was 171 bits (one more than expected), then `n` overflowed to 257 + // bits. Explicitly handling the carry avoids 512-bit division. if c { let neg_c := not(0x00) let rem := mod(n, d) @@ -1910,6 +1913,7 @@ library Lib512MathArithmetic { r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo)))) } + // Un-normalize return r >> shift; } } From 243ee2eb20cda610cd8026619830428e7d03dcf5 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 09:45:01 +0100 Subject: [PATCH 14/38] Cleanup --- src/utils/512Math.sol | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index d0f35ec02..ed58b25c1 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1838,7 +1838,7 @@ library Lib512MathArithmetic { // Now we run the "normal" cube root algorithm to obtain the first limb of `r`, which we // store in `r_hi`. `res` is the residue after this first operation and `d` is the - // derivative/denominator for the subsequent outer Newton step of the Karatsuba "loop". + // derivative/denominator for the subsequent Karatsuba step. uint256 r_hi; uint256 res; uint256 d; @@ -1890,14 +1890,13 @@ library Lib512MathArithmetic { } } - // Second-order correction for the omitted quadratic term in the first-limb lift. - // With B = 2^86, q := r_lo, and a := r_hi: - // q' = q - floor(q^2 / (a * B)). - // This removes almost all of the overshoot from the first-order estimate. + // Unlike the square-root case, the error from the linear Karatsuba step can still be + // large because the expansion has more terms. We do a quadratic correction to get close + // enough that the single subtraction is sufficient for exactness. r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86); r = (r_hi << 86) + r_lo; - // Exact floor correction. + // Our error is now down to 1ulp. Check if `r` overshoots and correct. assembly ("memory-safe") { let mm0 := mulmod(r, r, not(0x00)) let r2_lo := mul(r, r) @@ -1905,10 +1904,7 @@ library Lib512MathArithmetic { let mm1 := mulmod(r2_lo, r, not(0x00)) let lo := mul(r2_lo, r) - let hi_lo := sub(sub(mm1, lo), lt(mm1, lo)) - - let cross_lo := mul(r2_hi, r) - let hi := add(hi_lo, cross_lo) + let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r)) r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo)))) } From 2b0fb15fe9ca3cf5761fe1073fe717360b6ef42c Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 10:16:18 +0100 Subject: [PATCH 15/38] Add a fuzz test to explicitly exercise a known weak-point of some `cbrt` implementations --- test/0.8.25/512Math.t.sol | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol index af23c317a..8b6e5b5b0 100644 --- a/test/0.8.25/512Math.t.sol +++ b/test/0.8.25/512Math.t.sol @@ -437,6 +437,12 @@ contract Lib512MathTest is Test { } } + function test512Math_cbrt_perfectCube(uint256 r) external pure { + r = bound(r, 1, 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628); + uint512 x = alloc().omul(r, r).imul(r); + assertEq(x.cbrt(), r); + } + function test512Math_oshrUp(uint256 x_hi, uint256 x_lo, uint256 s) external pure { s = bound(s, 0, 512); From 3a4e2882ba46ae2eb8e2a079529676908d5fb407 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 10:43:54 +0100 Subject: [PATCH 16/38] WIP: 512-bit cbrtUp --- test/0.8.25/512Math.t.sol | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol index 8b6e5b5b0..2d4a4df2f 100644 --- a/test/0.8.25/512Math.t.sol +++ b/test/0.8.25/512Math.t.sol @@ -443,6 +443,33 @@ contract Lib512MathTest is Test { assertEq(x.cbrt(), r); } + function test512Math_cbrtUp(uint256 x_hi, uint256 x_lo) external pure { + uint512 x = alloc().from(x_hi, x_lo); + uint256 r = x.cbrtUp(); + uint512 r3 = alloc().omul(r, r).imul(r); + + if ( + x_hi > 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f + || (x_hi == 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f + && x_lo > 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d7a00) + ) { + assertEq(r, 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885629, "cbrtUp overflow"); + } else { + assertTrue(r3 >= x, "cbrtUp too low"); + } + if (x_hi != 0 && x_lo != 0) { + r--; + r3.omul(r, r).imul(r); + assertTrue(r3 < x, "cbrtUp too high"); + } + } + + function test512Math_cbrtUp_perfectCube(uint256 r) external pure { + r = bound(r, 1, 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628); + uint512 x = alloc().omul(r, r).imul(r); + assertEq(x.cbrtUp(), r); + } + function test512Math_oshrUp(uint256 x_hi, uint256 x_lo, uint256 s) external pure { s = bound(s, 0, 512); From d1fedff4a492a121c91465ca65998bfdf293e4b8 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 10:44:32 +0100 Subject: [PATCH 17/38] `cbrtUp` --- src/utils/512Math.sol | 58 ++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index ed58b25c1..9cc308f29 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -184,6 +184,7 @@ WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING /// ### Cube root /// /// * cbrt(uint512) returns (uint256) +/// * cbrtUp(uint512) returns (uint256) /// /// ### Shifting /// @@ -1895,33 +1896,60 @@ library Lib512MathArithmetic { // enough that the single subtraction is sufficient for exactness. r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86); r = (r_hi << 86) + r_lo; - - // Our error is now down to 1ulp. Check if `r` overshoots and correct. - assembly ("memory-safe") { - let mm0 := mulmod(r, r, not(0x00)) - let r2_lo := mul(r, r) - let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo)) - - let mm1 := mulmod(r2_lo, r, not(0x00)) - let lo := mul(r2_lo, r) - let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r)) - - r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo)))) - } + // Our error is now down to 1ulp. // Un-normalize return r >> shift; } } - function cbrt(uint512 x) internal pure returns (uint256) { + function cbrt(uint512 x) internal pure returns (uint256 r) { (uint256 x_hi, uint256 x_lo) = x.into(); if (x_hi == 0) { return x_lo.cbrt(); } - return _cbrt(x_hi, x_lo); + r = _cbrt(x_hi, x_lo); + + // `_cbrt` gives a result within 1ulp. Check if `r` overshoots and correct. + assembly ("memory-safe") { + let mm0 := mulmod(r, r, not(0x00)) + let r2_lo := mul(r, r) + let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo)) + + let mm1 := mulmod(r2_lo, r, not(0x00)) + let lo := mul(r2_lo, r) + let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r)) + + r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo)))) + } + } + + function cbrtUp(uint512 x) internal pure returns (uint256 r) { + (uint256 x_hi, uint256 x_lo) = x.into(); + + if (x_hi == 0) { + r = x_lo.cbrt(); + unchecked { + r = r.unsafeInc(r * r * r < x_lo); + } + } else { + r = _cbrt(x_hi, x_lo); + + // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct. + assembly ("memory-safe") { + let mm0 := mulmod(r, r, not(0x00)) + let r2_lo := mul(r, r) + let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo)) + + let mm1 := mulmod(r2_lo, r, not(0x00)) + let lo := mul(r2_lo, r) + let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r)) + + r := add(r, or(lt(hi, x_hi), and(eq(hi, x_hi), lt(lo, x_lo)))) + } + } } function oshr(uint512 r, uint512 x, uint256 s) internal pure returns (uint512) { From 09a9018cf8bc570235401c3d879e37fe63ccb5f3 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 11:00:47 +0100 Subject: [PATCH 18/38] Golf --- src/utils/512Math.sol | 5 +---- src/vendor/Cbrt.sol | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 9cc308f29..81bb596bf 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1930,10 +1930,7 @@ library Lib512MathArithmetic { (uint256 x_hi, uint256 x_lo) = x.into(); if (x_hi == 0) { - r = x_lo.cbrt(); - unchecked { - r = r.unsafeInc(r * r * r < x_lo); - } + r = x_lo.cbrtUp(); } else { r = _cbrt(x_hi, x_lo); diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol index 3604c919c..e1465ba2a 100644 --- a/src/vendor/Cbrt.sol +++ b/src/vendor/Cbrt.sol @@ -8,7 +8,7 @@ library Cbrt { /// https://github.com/pcaversaccio/snekmate/blob/main/src/snekmate/utils/math.vy /// Formally verified by xuwinnie: /// https://github.com/vectorized/solady/blob/main/audits/xuwinnie-solady-cbrt-proof.pdf - function cbrt(uint256 x) internal pure returns (uint256 z) { + function _cbrt(uint256 x) private pure returns (uint256 z) { assembly ("memory-safe") { // Initial guess z = 2^ceil((log2(x) + 2) / 3). // Since log2(x) = 255 - clz(x), the expression shl((257 - clz(x)) / 3, 1) @@ -22,8 +22,22 @@ library Cbrt { z := div(add(add(div(x, mul(z, z)), z), z), 3) z := div(add(add(div(x, mul(z, z)), z), z), 3) z := div(add(add(div(x, mul(z, z)), z), z), 3) + } + } + + function cbrt(uint256 x) internal pure returns (uint256 z) { + z = _cbrt(x); + assembly ("memory-safe") { // Round down. z := sub(z, lt(div(x, mul(z, z)), z)) } } + + function cbrtUp(uint256 x) internal pure returns (uint256 z) { + z = _cbrt(x); + assembly ("memory-safe") { + // Round up. + z := add(z, lt(mul(z, mul(z, z)), x)) + } + } } From 9a361bd00a6a68f44278c05113e065600abd2cd7 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 11:07:40 +0100 Subject: [PATCH 19/38] Improve test strictness --- test/0.8.25/512Math.t.sol | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol index 2d4a4df2f..037d972d4 100644 --- a/test/0.8.25/512Math.t.sol +++ b/test/0.8.25/512Math.t.sol @@ -428,9 +428,13 @@ contract Lib512MathTest is Test { uint512 r3 = alloc().omul(r, r).imul(r); assertTrue(r3 <= x, "cbrt too high"); - if (!(x_hi > 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f - || (x_hi == 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f - && x_lo > 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d79ff))) { + if ( + x_hi > 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f + || (x_hi == 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f + && x_lo > 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d79ff) + ) { + assertEq(r, 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628, "cbrt overflow"); + } else { r++; r3.omul(r, r).imul(r); assertTrue(r3 > x, "cbrt too low"); From d0959f67a777d02bf5175a4d8d8bcb6138fd5b92 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 11:10:49 +0100 Subject: [PATCH 20/38] Improve test strictness --- test/0.8.25/512Math.t.sol | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol index 037d972d4..2134d15ff 100644 --- a/test/0.8.25/512Math.t.sol +++ b/test/0.8.25/512Math.t.sol @@ -461,7 +461,7 @@ contract Lib512MathTest is Test { } else { assertTrue(r3 >= x, "cbrtUp too low"); } - if (x_hi != 0 && x_lo != 0) { + if (x_hi != 0 || x_lo != 0) { r--; r3.omul(r, r).imul(r); assertTrue(r3 < x, "cbrtUp too high"); From a5a8fcefae182fad61f082384bcf5f4d5c82c714 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 11:41:52 +0100 Subject: [PATCH 21/38] Comment --- src/utils/512Math.sol | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 81bb596bf..61d021b7a 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1826,10 +1826,10 @@ library Lib512MathArithmetic { (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift * 3); // Zimmerman's "Karatsuba Square Root" algorithm works with limbs of `r` that are half - // of a word. For cube root, we use limbs of `r` that are one third of a word. The - // initial step to compute the first "limb" of `r` uses the "normal" cube root algorithm - // and consumes the first (almost) word of `x`. The second and final limb of `r` is - // computed using an analogue of the Karatsuba step from the original algorithm, + // of a word. For cube root, we use limbs of `r` that are (roughly) one third of a + // word. The initial step to compute the first "limb" of `r` uses the "normal" cube root + // algorithm and consumes the first (almost) word of `x`. The second and final limb of + // `r` is computed using an analogue of the Karatsuba step from the original algorithm, // followed by a pair of cleanup steps. `limb_hi` is the next 86-bit limb of `x` after // the first whole-ish word. uint256 limb_hi; From c573dab42713a8d311575ed66be4adbca0da240a Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 11:59:04 +0100 Subject: [PATCH 22/38] Comment --- src/utils/512Math.sol | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 61d021b7a..54b94f217 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1872,8 +1872,9 @@ library Lib512MathArithmetic { // This is the Karatsuba step. The 86-bit lower limb of `r` is (almost): // r_lo = ⌊(res ⋅ 2⁸⁶ + limb_hi) / (3 ⋅ r_hi²)⌋ - // res = (res ⋅ 2⁸⁶ + limb_hi) % (3 ⋅ r_hi²) - // Where `res` is the (nearly) 2-limb residue from the previous "normal" cube root step. + // Where `res` is the (nearly) 2-limb residue from the previous "normal" cube root + // step. We discard `res` after this step and perform a quadratic correction instead of + // the underflow check from Zimmerman uint256 r_lo; assembly ("memory-safe") { let n := or(shl(0x56, res), limb_hi) @@ -1894,6 +1895,11 @@ library Lib512MathArithmetic { // Unlike the square-root case, the error from the linear Karatsuba step can still be // large because the expansion has more terms. We do a quadratic correction to get close // enough that the single subtraction is sufficient for exactness. + // + // In the square-root version, the only ignored term in (s + q)² is q², which is small + // enough for a 1ulp correction. For cube root, the binomial expansion (r_hi·2⁸⁶ + + // r_lo)³ contains the cross term 3·(r_hi·2⁸⁶)·r_lo². The linear Karatsuba step + // overestimates r_lo by ≈r_lo²/(r_hi·2⁸⁶). After correction, this leaves only r_lo³ r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86); r = (r_hi << 86) + r_lo; // Our error is now down to 1ulp. From 67cc8e4d6a10b2c23d31e42574ae41e56179fb74 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 12:04:31 +0100 Subject: [PATCH 23/38] Consistency --- src/utils/512Math.sol | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 54b94f217..c15ae36cb 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1936,22 +1936,22 @@ library Lib512MathArithmetic { (uint256 x_hi, uint256 x_lo) = x.into(); if (x_hi == 0) { - r = x_lo.cbrtUp(); - } else { - r = _cbrt(x_hi, x_lo); + return x_lo.cbrtUp(); + } - // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct. - assembly ("memory-safe") { - let mm0 := mulmod(r, r, not(0x00)) - let r2_lo := mul(r, r) - let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo)) + r = _cbrt(x_hi, x_lo); - let mm1 := mulmod(r2_lo, r, not(0x00)) - let lo := mul(r2_lo, r) - let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r)) + // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct. + assembly ("memory-safe") { + let mm0 := mulmod(r, r, not(0x00)) + let r2_lo := mul(r, r) + let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo)) - r := add(r, or(lt(hi, x_hi), and(eq(hi, x_hi), lt(lo, x_lo)))) - } + let mm1 := mulmod(r2_lo, r, not(0x00)) + let lo := mul(r2_lo, r) + let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r)) + + r := add(r, or(lt(hi, x_hi), and(eq(hi, x_hi), lt(lo, x_lo)))) } } From a1c9d47a7f21db2fd7c6b8cb8a5cb4c590e2e18f Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 12:10:43 +0100 Subject: [PATCH 24/38] Comment --- src/utils/512Math.sol | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index c15ae36cb..176e95ae0 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1899,7 +1899,8 @@ library Lib512MathArithmetic { // In the square-root version, the only ignored term in (s + q)² is q², which is small // enough for a 1ulp correction. For cube root, the binomial expansion (r_hi·2⁸⁶ + // r_lo)³ contains the cross term 3·(r_hi·2⁸⁶)·r_lo². The linear Karatsuba step - // overestimates r_lo by ≈r_lo²/(r_hi·2⁸⁶). After correction, this leaves only r_lo³ + // overestimates r_lo by ≈r_lo²/(r_hi·2⁸⁶). After correction, this leaves only the r_lo³ + // term, on the order of 2²⁵⁸/(3·2³⁴²), much less than 1ulp. r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86); r = (r_hi << 86) + r_lo; // Our error is now down to 1ulp. From 96732f446f9dd823ec62dd25ae1007ac597a51b2 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 12:26:41 +0100 Subject: [PATCH 25/38] Style --- src/utils/512Math.sol | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 176e95ae0..eceb362b0 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1844,7 +1844,7 @@ library Lib512MathArithmetic { uint256 res; uint256 d; assembly ("memory-safe") { - let w := shr(2, x_hi) // w ≥ 2²⁵¹; w < 2²⁵⁴ from the above normalization + let w := shr(0x02, x_hi) // w ≥ 2²⁵¹; w < 2²⁵⁴ from the above normalization r_hi := 0x1000000000000000000000 // Given `w` in its range, this seed is suitable r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03) From 2cd9eccd37e743aa8b14846dd9d6ebd8fd3a0a67 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 12:28:34 +0100 Subject: [PATCH 26/38] Style --- src/utils/512Math.sol | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index eceb362b0..4a9c88f1c 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1830,12 +1830,7 @@ library Lib512MathArithmetic { // word. The initial step to compute the first "limb" of `r` uses the "normal" cube root // algorithm and consumes the first (almost) word of `x`. The second and final limb of // `r` is computed using an analogue of the Karatsuba step from the original algorithm, - // followed by a pair of cleanup steps. `limb_hi` is the next 86-bit limb of `x` after - // the first whole-ish word. - uint256 limb_hi; - assembly ("memory-safe") { - limb_hi := or(shl(0x54, and(x_hi, 0x03)), shr(0xac, x_lo)) - } + // followed by a pair of cleanup steps. // Now we run the "normal" cube root algorithm to obtain the first limb of `r`, which we // store in `r_hi`. `res` is the residue after this first operation and `d` is the @@ -1870,6 +1865,11 @@ library Lib512MathArithmetic { d := mul(0x03, r_hi_sq) } + // `limb_hi` is the next 86-bit limb of `x` after the first whole-ish word. + uint256 limb_hi; + assembly ("memory-safe") { + limb_hi := or(shl(0x54, and(x_hi, 0x03)), shr(0xac, x_lo)) + } // This is the Karatsuba step. The 86-bit lower limb of `r` is (almost): // r_lo = ⌊(res ⋅ 2⁸⁶ + limb_hi) / (3 ⋅ r_hi²)⌋ // Where `res` is the (nearly) 2-limb residue from the previous "normal" cube root From 0eecce176d2510cdd368284c1325d89ac9e955cf Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 12:30:58 +0100 Subject: [PATCH 27/38] Style --- src/utils/512Math.sol | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 4a9c88f1c..ff21e6b6c 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1761,9 +1761,8 @@ library Lib512MathArithmetic { // It's possible that `n` was 257 bits and overflowed (`res` was not just a single // limb). Explicitly handling the carry avoids 512-bit division. if c { - let neg_c := not(0x00) - r_lo := add(r_lo, div(neg_c, d)) - res := add(res, add(0x01, mod(neg_c, d))) + r_lo := add(r_lo, div(not(0x00), d)) + res := add(res, add(0x01, mod(not(0x00), d))) r_lo := add(r_lo, div(res, d)) res := mod(res, d) } @@ -1884,10 +1883,9 @@ library Lib512MathArithmetic { // If `res` was 171 bits (one more than expected), then `n` overflowed to 257 // bits. Explicitly handling the carry avoids 512-bit division. if c { - let neg_c := not(0x00) let rem := mod(n, d) - r_lo := add(r_lo, div(neg_c, d)) - rem := add(rem, add(0x01, mod(neg_c, d))) + r_lo := add(r_lo, div(not(0x00), d)) + rem := add(rem, add(0x01, mod(not(0x00), d))) r_lo := add(r_lo, div(rem, d)) } } From 55efb2e22629648bfcc6f30d81c2aa7e226c4d62 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 12:32:17 +0100 Subject: [PATCH 28/38] Style --- src/utils/512Math.sol | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index ff21e6b6c..551fa79a9 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1879,10 +1879,9 @@ library Lib512MathArithmetic { let n := or(shl(0x56, res), limb_hi) r_lo := div(n, d) - let c := shr(0xaa, res) // If `res` was 171 bits (one more than expected), then `n` overflowed to 257 // bits. Explicitly handling the carry avoids 512-bit division. - if c { + if shr(0xaa, res) { let rem := mod(n, d) r_lo := add(r_lo, div(not(0x00), d)) rem := add(rem, add(0x01, mod(not(0x00), d))) From 2873a31abe14289f8184dd43ca5103da878476f0 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 16:19:16 +0100 Subject: [PATCH 29/38] Natspec --- src/vendor/Cbrt.sol | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol index e1465ba2a..ccd471dd2 100644 --- a/src/vendor/Cbrt.sol +++ b/src/vendor/Cbrt.sol @@ -3,7 +3,7 @@ pragma solidity ^0.8.25; // @author Modified from Solady by Vectorized and Akshay Tarpara https://github.com/Vectorized/solady/blob/ff6256a18851749e765355b3e21dc9bfa417255b/src/utils/clz/FixedPointMathLib.sol#L799-L822 under the MIT license. library Cbrt { - /// @dev Returns the cube root of `x`, rounded down. + /// @dev Returns the cube root of `x`, rounded to within 1ulp. /// Credit to bout3fiddy and pcaversaccio under AGPLv3 license: /// https://github.com/pcaversaccio/snekmate/blob/main/src/snekmate/utils/math.vy /// Formally verified by xuwinnie: @@ -25,6 +25,7 @@ library Cbrt { } } + /// @dev Returns the cube root of `x`, rounded down. function cbrt(uint256 x) internal pure returns (uint256 z) { z = _cbrt(x); assembly ("memory-safe") { @@ -33,6 +34,7 @@ library Cbrt { } } + /// @dev Returns the cube root of `x`, rounded up. function cbrtUp(uint256 x) internal pure returns (uint256 z) { z = _cbrt(x); assembly ("memory-safe") { From e8fe93335adc1c36818dea2d5696b3c6a4df4cdd Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 18:23:46 +0100 Subject: [PATCH 30/38] Document overflow safety of cbrt/cbrtUp cube-and-compare MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add detailed comments explaining why _cbrt never returns r_max + 1 in the overflow regime (x > r_max^3), which would cause the 256-bit cube-and-compare to silently produce wrong results. The argument traces through the EVM execution: the truncated div(n, d) and carry-adjusted r_lo_pre are both constant across the regime because res varies by only ~0.62·2^83 and the quotient's fractional part stays in [0.118, 0.283]. Add fuzz test targeting the overflow-cube regime specifically. Co-Authored-By: Claude Opus 4.6 --- src/utils/512Math.sol | 34 ++++++++++++++++++++++++++++++++++ test/0.8.25/512Math.t.sol | 24 ++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 551fa79a9..b98789efe 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1916,6 +1916,38 @@ library Lib512MathArithmetic { r = _cbrt(x_hi, x_lo); + // Detailed note for the top-of-range regime: + // Let + // r_max = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628 + // and consider x in [r_max^3 + 1, 2^512 - 1]. In this regime, cubing r_max + 1 would + // overflow 512 bits, but `_cbrt` still returns r_max (never r_max + 1): + // 1) shift = clz(x_hi) / 3 = 0 for all such x. + // 2) w = x_hi >> 2 lies in [0x3f..fb0959fdf442978718ddcb, 2^254 - 1]. + // 3) In that full interval, floor(cbrt(w)) is constant: + // r_hi = 0x1965fea53d6e3c82b05999 + // because r_hi^3 <= w < (r_hi + 1)^3. + // 4) Therefore d = 3 * r_hi^2 is constant: + // d = 0x78f3d1d950af414cd731fe48f48fde1309821333853. + // 5) In the EVM code, n = shl(86, res) | limb_hi truncates to 256 bits (res is + // 171 bits, so res << 86 is 257 bits). The truncated div(n, d) is constant: + // div(n, d) = 0x8f3a38c7f3364c49d3405 for all x in the regime. + // The carry branch (shr(0xaa, res) != 0) fires for the entire regime since + // res_min = 0x50ea... is already 171 bits. After carry adjustment, r_lo_pre + // is constant (see step 6). Both the pre-carry and post-carry quotients stay + // in one bucket because res varies by only ~0.62·2^83 across the regime, + // and limb_hi's full 86-bit range contributes < 2^{-84} to n/d (negligible + // vs d ≈ 2^170.8). Total swing in the continuous quotient is ~0.164. + // At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max^3) and ≈ 0.283 (at + // x = 2^512 - 1), so the floor never crosses an integer boundary. + // 6) After the carry adjustment branch, r_lo_pre is constant: + // r_lo_pre = 0x2ad0f7137bc6601d885629. + // 7) The quadratic correction subtracts exactly 1: + // floor(r_lo_pre^2 / (r_hi << 86)) = 1 + // so r_lo = 0x2ad0f7137bc6601d885628 and + // r = (r_hi << 86) + r_lo = r_max. + // + // So for this regime, the cube-and-compare code below only cubes r_max (which fits in + // 512 bits). `cbrtUp` reaches r_max + 1 only via its final +1 correction. // `_cbrt` gives a result within 1ulp. Check if `r` overshoots and correct. assembly ("memory-safe") { let mm0 := mulmod(r, r, not(0x00)) @@ -1939,6 +1971,8 @@ library Lib512MathArithmetic { r = _cbrt(x_hi, x_lo); + // See the detailed overflow-regime note in `cbrt` above. In particular, near 2^512, + // `_cbrt` is pinned at r_max and does not return r_max + 1 directly. // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct. assembly ("memory-safe") { let mm0 := mulmod(r, r, not(0x00)) diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol index 2134d15ff..5ad0fc4b0 100644 --- a/test/0.8.25/512Math.t.sol +++ b/test/0.8.25/512Math.t.sol @@ -447,6 +447,30 @@ contract Lib512MathTest is Test { assertEq(x.cbrt(), r); } + function test512Math_cbrt_overflowCubeRegime(uint256 x_hi, uint256 x_lo) external pure { + uint256 r_max = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628; + uint256 r_max_plus_one = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885629; + uint256 r_max_cube_hi = 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f; + uint256 r_max_cube_lo = 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d7a00; + + // Force x > r_max^3 so cbrtUp(x) must return r_max + 1, whose cube is 513 bits. + // + // Why this still passes with the current implementation: + // `_cbrt` is returning `r_max` (not `r_max + 1`) in this regime. If `_cbrt` returned + // `r_max + 1`, then `cbrt` would have to decrement based on an overflowed cube-and-compare + // and this assertion would fail for these near-2^512 inputs. Because `cbrt` stays equal to + // `r_max`, both cube-and-compare paths only cube `r_max` (which fits in 512 bits), and + // `cbrtUp` reaches `r_max + 1` only via its final `+1` correction. + x_hi = bound(x_hi, r_max_cube_hi, type(uint256).max); + if (x_hi == r_max_cube_hi) { + x_lo = bound(x_lo, r_max_cube_lo + 1, type(uint256).max); + } + + uint512 x = alloc().from(x_hi, x_lo); + assertEq(x.cbrt(), r_max, "cbrt in overflow-cube regime"); + assertEq(x.cbrtUp(), r_max_plus_one, "cbrtUp in overflow-cube regime"); + } + function test512Math_cbrtUp(uint256 x_hi, uint256 x_lo) external pure { uint512 x = alloc().from(x_hi, x_lo); uint256 r = x.cbrtUp(); From cd2f23495aaea7abe93e21977f7840e2d7707349 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 18:59:42 +0100 Subject: [PATCH 31/38] Fix AI comments --- src/utils/512Math.sol | 66 +++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index b98789efe..48a566c8a 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1916,39 +1916,43 @@ library Lib512MathArithmetic { r = _cbrt(x_hi, x_lo); - // Detailed note for the top-of-range regime: - // Let + // The following cube-and-compare technique for obtaining the floor appears, at first, to + // have an overflow bug in it. Consider that `_cbrt` returns a value within 1ulp of the + // correct value. Define: // r_max = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628 - // and consider x in [r_max^3 + 1, 2^512 - 1]. In this regime, cubing r_max + 1 would - // overflow 512 bits, but `_cbrt` still returns r_max (never r_max + 1): - // 1) shift = clz(x_hi) / 3 = 0 for all such x. - // 2) w = x_hi >> 2 lies in [0x3f..fb0959fdf442978718ddcb, 2^254 - 1]. - // 3) In that full interval, floor(cbrt(w)) is constant: - // r_hi = 0x1965fea53d6e3c82b05999 - // because r_hi^3 <= w < (r_hi + 1)^3. - // 4) Therefore d = 3 * r_hi^2 is constant: - // d = 0x78f3d1d950af414cd731fe48f48fde1309821333853. - // 5) In the EVM code, n = shl(86, res) | limb_hi truncates to 256 bits (res is - // 171 bits, so res << 86 is 257 bits). The truncated div(n, d) is constant: - // div(n, d) = 0x8f3a38c7f3364c49d3405 for all x in the regime. - // The carry branch (shr(0xaa, res) != 0) fires for the entire regime since - // res_min = 0x50ea... is already 171 bits. After carry adjustment, r_lo_pre - // is constant (see step 6). Both the pre-carry and post-carry quotients stay - // in one bucket because res varies by only ~0.62·2^83 across the regime, - // and limb_hi's full 86-bit range contributes < 2^{-84} to n/d (negligible - // vs d ≈ 2^170.8). Total swing in the continuous quotient is ~0.164. - // At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max^3) and ≈ 0.283 (at - // x = 2^512 - 1), so the floor never crosses an integer boundary. - // 6) After the carry adjustment branch, r_lo_pre is constant: - // r_lo_pre = 0x2ad0f7137bc6601d885629. + // this means that for values of x in [r_max³ + 1, 2⁵¹² - 1], `_cbrt` could return r_max + + // 1, which would result in overflow when cubing `r`. However, this does not happen. Given + // `x` in the specified range, the `_cbrt` follows the steps below: + // + // 1) shift = clz(x_hi) / 3 = 0 + // 2) w = x_hi >> 2 lies in [0x3fff..fffb0959fdf442978718ddcb, 2²⁵⁴ - 1] + // 3) In that full interval, ⌊∛w⌋ is constant. For r_hi, we get: + // after 6 Newton-Raphson iterations: r_hi = 0x1965fea53d6e3c82b0c310 + // which forces a 7th iteration + // after the branch is taken: r_hi = 0x1965fea53d6e3c82b05999 + // 4) Therefore d = 3 ⋅ r_hi² is constant: + // d = 0x78f3d1d950af414cd731fe48f48fde1309821333853 + // 5) n = (res << 86) | limb_hi overflows and is truncated to 256 bits. The truncated ⌊n / d⌋ + // is constant: + // ⌊n / d⌋ = 0x8f3a38c7f3364c49d3405 + // The carry branch (res >> 170 != 0) fires. The carry adjustment modifies the truncated + // quotient by adding: + // ⌊(2²⁵⁶ - 1) / d⌋ = 0x21dd5386fc92fb58eb2224 + // and the final carry refinement term is zero, giving: + // r_lo = 0x2ad0f7137bc6601d885629 + // The quotient stays in one "bucket" because `res` varies by only ~0.62·2⁸³, and + // `limb_hi`'s full 86-bit range contributes <1/2⁸⁴ to n/d. Total swing in the continuous + // quotient is ~0.164. At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max³ + 1) and + // ≈0.283 (at x = 2⁵¹² - 1), so the floor never crosses an integer boundary + // 6) After the carry adjustment branch, `r_lo` is constant: + // r_lo = 0x2ad0f7137bc6601d885629 // 7) The quadratic correction subtracts exactly 1: - // floor(r_lo_pre^2 / (r_hi << 86)) = 1 + // ⌊r_lo² / (r_hi·2⁸⁶)⌋ = 1 // so r_lo = 0x2ad0f7137bc6601d885628 and - // r = (r_hi << 86) + r_lo = r_max. + // r = r_hi·2⁸⁶ + r_lo = r_max // - // So for this regime, the cube-and-compare code below only cubes r_max (which fits in - // 512 bits). `cbrtUp` reaches r_max + 1 only via its final +1 correction. - // `_cbrt` gives a result within 1ulp. Check if `r` overshoots and correct. + // So, the cube-and-compare code below only cubes a value of at most `r_max`, which fits in + // 512 bits. `cbrtUp` reaches `r_max + 1` only via its final +1 correction assembly ("memory-safe") { let mm0 := mulmod(r, r, not(0x00)) let r2_lo := mul(r, r) @@ -1971,10 +1975,10 @@ library Lib512MathArithmetic { r = _cbrt(x_hi, x_lo); - // See the detailed overflow-regime note in `cbrt` above. In particular, near 2^512, - // `_cbrt` is pinned at r_max and does not return r_max + 1 directly. // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct. assembly ("memory-safe") { + // See the detailed overflow-regime note in `cbrt` above. In particular, near 2⁵¹², + // `_cbrt` is pinned at `r_max` and does not return `r_max + 1` directly. let mm0 := mulmod(r, r, not(0x00)) let r2_lo := mul(r, r) let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo)) From 126ed9489ce4724d63ebb9a5af4fd597fb03c9b5 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Tue, 24 Feb 2026 19:18:45 +0100 Subject: [PATCH 32/38] Pedantry --- src/utils/512Math.sol | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index 48a566c8a..b1c62df9b 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1920,13 +1920,13 @@ library Lib512MathArithmetic { // have an overflow bug in it. Consider that `_cbrt` returns a value within 1ulp of the // correct value. Define: // r_max = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628 - // this means that for values of x in [r_max³ + 1, 2⁵¹² - 1], `_cbrt` could return r_max + - // 1, which would result in overflow when cubing `r`. However, this does not happen. Given - // `x` in the specified range, the `_cbrt` follows the steps below: + // this means that for values of x in [r_max³, 2⁵¹² - 1], `_cbrt` could return r_max + 1, + // which would result in overflow when cubing `r`. However, this does not happen. Given `x` + // in the specified range, the `_cbrt` follows the steps below: // - // 1) shift = clz(x_hi) / 3 = 0 + // 1) shift = ⌊clz(x_hi) / 3⌋ = 0 // 2) w = x_hi >> 2 lies in [0x3fff..fffb0959fdf442978718ddcb, 2²⁵⁴ - 1] - // 3) In that full interval, ⌊∛w⌋ is constant. For r_hi, we get: + // 3) In that full interval, ⌊∛w⌋ is constant. For `r_hi`, we get: // after 6 Newton-Raphson iterations: r_hi = 0x1965fea53d6e3c82b0c310 // which forces a 7th iteration // after the branch is taken: r_hi = 0x1965fea53d6e3c82b05999 @@ -1937,13 +1937,13 @@ library Lib512MathArithmetic { // ⌊n / d⌋ = 0x8f3a38c7f3364c49d3405 // The carry branch (res >> 170 != 0) fires. The carry adjustment modifies the truncated // quotient by adding: - // ⌊(2²⁵⁶ - 1) / d⌋ = 0x21dd5386fc92fb58eb2224 + // ⌊(2²⁵⁶ - 1) / d⌋ = 0x21dd5386fc92fb58eb2224 // and the final carry refinement term is zero, giving: // r_lo = 0x2ad0f7137bc6601d885629 // The quotient stays in one "bucket" because `res` varies by only ~0.62·2⁸³, and // `limb_hi`'s full 86-bit range contributes <1/2⁸⁴ to n/d. Total swing in the continuous - // quotient is ~0.164. At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max³ + 1) and - // ≈0.283 (at x = 2⁵¹² - 1), so the floor never crosses an integer boundary + // quotient is ~0.164. At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max³) and ≈0.283 + // (at x = 2⁵¹² - 1), so the floor never crosses an integer boundary // 6) After the carry adjustment branch, `r_lo` is constant: // r_lo = 0x2ad0f7137bc6601d885629 // 7) The quadratic correction subtracts exactly 1: From 3574b35ec07e4b62e44bcb18c8a0400646157d7f Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Wed, 25 Feb 2026 12:10:57 +0100 Subject: [PATCH 33/38] Bug! Avoid overflow in `Cbrt.cbrtUp` --- src/vendor/Cbrt.sol | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol index ccd471dd2..32d244444 100644 --- a/src/vendor/Cbrt.sol +++ b/src/vendor/Cbrt.sol @@ -38,8 +38,10 @@ library Cbrt { function cbrtUp(uint256 x) internal pure returns (uint256 z) { z = _cbrt(x); assembly ("memory-safe") { - // Round up. - z := add(z, lt(mul(z, mul(z, z)), x)) + // Round up. Avoid cubing `z` to avoid overflow + let z2 := mul(z, z) + let d := div(x, z2) + z := add(z, gt(add(d, lt(mul(d, z2), x)), z)) } } } From 817bb55faafe39270680872b32813d3408e91caa Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Wed, 25 Feb 2026 12:11:51 +0100 Subject: [PATCH 34/38] Add some tests for `cbrtUp` and `sqrtUp` --- test/0.8.25/Cbrt.t.sol | 28 ++++++++++++++++++++++++++++ test/0.8.25/Sqrt.t.sol | 28 ++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 test/0.8.25/Cbrt.t.sol create mode 100644 test/0.8.25/Sqrt.t.sol diff --git a/test/0.8.25/Cbrt.t.sol b/test/0.8.25/Cbrt.t.sol new file mode 100644 index 000000000..a07dffebf --- /dev/null +++ b/test/0.8.25/Cbrt.t.sol @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +pragma solidity ^0.8.25; + +import {Cbrt} from "src/vendor/Cbrt.sol"; +import {Test} from "@forge-std/Test.sol"; + +contract CbrtTest is Test { + using Cbrt for uint256; + + uint256 private constant _CBRT_FLOOR_MAX_UINT256 = 0x285145f31ae515c447bb56; + uint256 private constant _CBRT_CEIL_MAX_UINT256 = 0x285145f31ae515c447bb57; + uint256 private constant _CBRT_FLOOR_MAX_UINT256_CUBE = + 0xffffffffffffffffffffef214b5539a2d22f71387253e480168f34c9da3f5898; + + function testCbrtUp_overflowCubeRange(uint256 x) external pure { + x = bound(x, _CBRT_FLOOR_MAX_UINT256_CUBE + 1, type(uint256).max); + + assertEq(x.cbrt(), _CBRT_FLOOR_MAX_UINT256, "cbrt overflow-cube range"); + assertEq(x.cbrtUp(), _CBRT_CEIL_MAX_UINT256, "cbrtUp overflow-cube range"); + } + + function testCbrtUp_overflowCubeBoundary() external pure { + uint256 x = _CBRT_FLOOR_MAX_UINT256_CUBE; + + assertEq(x.cbrt(), _CBRT_FLOOR_MAX_UINT256, "cbrt boundary"); + assertEq(x.cbrtUp(), _CBRT_FLOOR_MAX_UINT256, "cbrtUp boundary"); + } +} diff --git a/test/0.8.25/Sqrt.t.sol b/test/0.8.25/Sqrt.t.sol new file mode 100644 index 000000000..1a8a085ed --- /dev/null +++ b/test/0.8.25/Sqrt.t.sol @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +pragma solidity ^0.8.25; + +import {Sqrt} from "src/vendor/Sqrt.sol"; +import {Test} from "@forge-std/Test.sol"; + +contract SqrtTest is Test { + using Sqrt for uint256; + + uint256 private constant _SQRT_FLOOR_MAX_UINT256 = type(uint128).max; + uint256 private constant _SQRT_CEIL_MAX_UINT256 = uint256(1) << 128; + uint256 private constant _SQRT_FLOOR_MAX_UINT256_SQUARED = + 0xfffffffffffffffffffffffffffffffe00000000000000000000000000000001; + + function testSqrtUp_overflowSquareRange(uint256 x) external pure { + x = bound(x, _SQRT_FLOOR_MAX_UINT256_SQUARED + 1, type(uint256).max); + + assertEq(x.sqrt(), _SQRT_FLOOR_MAX_UINT256, "sqrt overflow-square range"); + assertEq(x.sqrtUp(), _SQRT_CEIL_MAX_UINT256, "sqrtUp overflow-square range"); + } + + function testSqrtUp_overflowSquareBoundary() external pure { + uint256 x = _SQRT_FLOOR_MAX_UINT256_SQUARED; + + assertEq(x.sqrt(), _SQRT_FLOOR_MAX_UINT256, "sqrt boundary"); + assertEq(x.sqrtUp(), _SQRT_FLOOR_MAX_UINT256, "sqrtUp boundary"); + } +} From f061574491bef0af09f1fb1afb57bcad7c653011 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Wed, 25 Feb 2026 16:48:06 +0100 Subject: [PATCH 35/38] Add more tests --- test/0.8.25/Cbrt.t.sol | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test/0.8.25/Cbrt.t.sol b/test/0.8.25/Cbrt.t.sol index a07dffebf..6c439b934 100644 --- a/test/0.8.25/Cbrt.t.sol +++ b/test/0.8.25/Cbrt.t.sol @@ -12,6 +12,32 @@ contract CbrtTest is Test { uint256 private constant _CBRT_FLOOR_MAX_UINT256_CUBE = 0xffffffffffffffffffffef214b5539a2d22f71387253e480168f34c9da3f5898; + function testCbrt(uint256 x) external pure { + uint256 r = x.cbrt(); + assertLe(r * r * r, x, "cbrt too high"); + if (x < _CBRT_FLOOR_MAX_UINT256_CUBE) { + r++; + assertGt(r * r * r, x, "cbrt too low"); + } else { + assertEq(r, _CBRT_FLOOR_MAX_UINT256, "cbrt overflow"); + } + } + + function testCbrtUp(uint256 x) external pure { + uint256 r = x.cbrtUp(); + if (x <= _CBRT_FLOOR_MAX_UINT256_CUBE) { + assertGe(r * r * r, x, "cbrtUp too low"); + } else { + assertEq(r, _CBRT_CEIL_MAX_UINT256, "cbrtUp overflow"); + } + if (x != 0) { + r--; + assertLt(r * r * r, x, "cbrtUp too high"); + } else { + assertEq(r, 0, "cbrtUp underflow"); + } + } + function testCbrtUp_overflowCubeRange(uint256 x) external pure { x = bound(x, _CBRT_FLOOR_MAX_UINT256_CUBE + 1, type(uint256).max); From a131f5584cc70550cca7b4acf5485d8438810fcf Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Wed, 25 Feb 2026 16:49:31 +0100 Subject: [PATCH 36/38] Add more tests --- test/0.8.25/Sqrt.t.sol | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test/0.8.25/Sqrt.t.sol b/test/0.8.25/Sqrt.t.sol index 1a8a085ed..f9c051113 100644 --- a/test/0.8.25/Sqrt.t.sol +++ b/test/0.8.25/Sqrt.t.sol @@ -12,6 +12,32 @@ contract SqrtTest is Test { uint256 private constant _SQRT_FLOOR_MAX_UINT256_SQUARED = 0xfffffffffffffffffffffffffffffffe00000000000000000000000000000001; + function testSqrt(uint256 x) external pure { + uint256 r = x.sqrt(); + assertLe(r * r, x, "sqrt too high"); + if (x < _SQRT_FLOOR_MAX_UINT256_SQUARED) { + r++; + assertGt(r * r, x, "sqrt too low"); + } else { + assertEq(r, _SQRT_FLOOR_MAX_UINT256, "sqrt overflow"); + } + } + + function testSqrtUp(uint256 x) external pure { + uint256 r = x.sqrtUp(); + if (x <= _SQRT_FLOOR_MAX_UINT256_SQUARED) { + assertGe(r * r, x, "sqrtUp too low"); + } else { + assertEq(r, _SQRT_CEIL_MAX_UINT256, "sqrtUp overflow"); + } + if (x != 0) { + r--; + assertLt(r * r, x, "sqrtUp too high"); + } else { + assertEq(r, 0, "sqrtUp underflow"); + } + } + function testSqrtUp_overflowSquareRange(uint256 x) external pure { x = bound(x, _SQRT_FLOOR_MAX_UINT256_SQUARED + 1, type(uint256).max); From 98adb95b127650fd59df26833b23b37859853ed6 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Wed, 25 Feb 2026 16:49:50 +0100 Subject: [PATCH 37/38] `forge fmt` --- test/0.8.25/Cbrt.t.sol | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/0.8.25/Cbrt.t.sol b/test/0.8.25/Cbrt.t.sol index 6c439b934..189d3bd4c 100644 --- a/test/0.8.25/Cbrt.t.sol +++ b/test/0.8.25/Cbrt.t.sol @@ -37,7 +37,7 @@ contract CbrtTest is Test { assertEq(r, 0, "cbrtUp underflow"); } } - + function testCbrtUp_overflowCubeRange(uint256 x) external pure { x = bound(x, _CBRT_FLOOR_MAX_UINT256_CUBE + 1, type(uint256).max); From 6e0df453af8deb7bca84c3472211f234d33bf6f6 Mon Sep 17 00:00:00 2001 From: Duncan Townsend Date: Thu, 26 Feb 2026 11:40:51 +0100 Subject: [PATCH 38/38] Comments --- src/utils/512Math.sol | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol index b1c62df9b..1de99bccc 100644 --- a/src/utils/512Math.sol +++ b/src/utils/512Math.sol @@ -1953,16 +1953,22 @@ library Lib512MathArithmetic { // // So, the cube-and-compare code below only cubes a value of at most `r_max`, which fits in // 512 bits. `cbrtUp` reaches `r_max + 1` only via its final +1 correction + // + // The following assembly block is identical to + // (uint256 r2_hi, uint256 r2_lo) = _mul(r, r); + // (uint256 r3_hi, uint256 r3_lo) = _mul(r2_hi, r2_lo, r); + // r = r.unsafeDec(_gt(r3_hi, r3_lo, x_hi, x_lo)); + // but is substantially more gas efficient for inexplicable reasons assembly ("memory-safe") { - let mm0 := mulmod(r, r, not(0x00)) + let mm := mulmod(r, r, not(0x00)) let r2_lo := mul(r, r) - let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo)) + let r2_hi := sub(sub(mm, r2_lo), lt(mm, r2_lo)) - let mm1 := mulmod(r2_lo, r, not(0x00)) - let lo := mul(r2_lo, r) - let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r)) + mm := mulmod(r2_lo, r, not(0x00)) + let r3_lo := mul(r2_lo, r) + let r3_hi := add(sub(sub(mm, r3_lo), lt(mm, r3_lo)), mul(r2_hi, r)) - r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo)))) + r := sub(r, or(gt(r3_hi, x_hi), and(eq(r3_hi, x_hi), gt(r3_lo, x_lo)))) } } @@ -1976,18 +1982,24 @@ library Lib512MathArithmetic { r = _cbrt(x_hi, x_lo); // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct. + // + // The following assembly block is identical to + // (uint256 r2_hi, uint256 r2_lo) = _mul(r, r); + // (uint256 r3_hi, uint256 r3_lo) = _mul(r2_hi, r2_lo, r); + // r = r.unsafeInc(_gt(x_hi, x_lo, r3_hi, r3_lo)); + // but is substantially more gas efficient for inexplicable reasons assembly ("memory-safe") { - // See the detailed overflow-regime note in `cbrt` above. In particular, near 2⁵¹², + // See the detailed overflow-regime note in `cbrt` above. In particular, near x = 2⁵¹², // `_cbrt` is pinned at `r_max` and does not return `r_max + 1` directly. - let mm0 := mulmod(r, r, not(0x00)) + let mm := mulmod(r, r, not(0x00)) let r2_lo := mul(r, r) - let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo)) + let r2_hi := sub(sub(mm, r2_lo), lt(mm, r2_lo)) - let mm1 := mulmod(r2_lo, r, not(0x00)) - let lo := mul(r2_lo, r) - let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r)) + mm := mulmod(r2_lo, r, not(0x00)) + let r3_lo := mul(r2_lo, r) + let r3_hi := add(sub(sub(mm, r3_lo), lt(mm, r3_lo)), mul(r2_hi, r)) - r := add(r, or(lt(hi, x_hi), and(eq(hi, x_hi), lt(lo, x_lo)))) + r := add(r, or(lt(r3_hi, x_hi), and(eq(r3_hi, x_hi), lt(r3_lo, x_lo)))) } }