From 8297d2edd1afb970dbe620c89df078aa0286e283 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 21:17:47 +0100
Subject: [PATCH 01/38] WIP: 512Math cbrt

---
 src/vendor/Cbrt.sol | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 src/vendor/Cbrt.sol

diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol
new file mode 100644
index 000000000..cb0f80e0f
--- /dev/null
+++ b/src/vendor/Cbrt.sol
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+pragma solidity ^0.8.25;
+
+// @author Modified from Solady by Vectorized and Akshay Tarpara https://github.com/Vectorized/solady/blob/ff6256a18851749e765355b3e21dc9bfa417255b/src/utils/clz/FixedPointMathLib.sol#L799-L822 under the MIT license.
+library Cbrt {
+    /// @dev Returns the cube root of `x`, rounded down.
+    /// Credit to bout3fiddy and pcaversaccio under AGPLv3 license:
+    /// https://github.com/pcaversaccio/snekmate/blob/main/src/snekmate/utils/math.vy
+    /// Formally verified by xuwinnie:
+    /// https://github.com/vectorized/solady/blob/main/audits/xuwinnie-solady-cbrt-proof.pdf
+    function cbrt(uint256 x) internal pure returns (uint256 z) {
+        /// @solidity memory-safe-assembly
+        assembly ("memory-safe") {
+            // Initial guess z = 2^ceil((log2(x) + 2) / 3).
+            // Since log2(x) = 255 - clz(x), the expression shl((257 - clz(x)) / 3, 1)
+            // computes this over-estimate. Guaranteed ≥ cbrt(x) and safe for Newton-Raphson's.
+            z := shl(div(sub(257, clz(x)), 3), 1)
+            // Newton-Raphson's.
+            z := div(add(add(div(x, mul(z, z)), z), z), 3)
+            z := div(add(add(div(x, mul(z, z)), z), z), 3)
+            z := div(add(add(div(x, mul(z, z)), z), z), 3)
+            z := div(add(add(div(x, mul(z, z)), z), z), 3)
+            z := div(add(add(div(x, mul(z, z)), z), z), 3)
+            z := div(add(add(div(x, mul(z, z)), z), z), 3)
+            z := div(add(add(div(x, mul(z, z)), z), z), 3)
+            // Round down.
+            z := sub(z, lt(div(x, mul(z, z)), z))
+        }
+    }
+}

From 2370b466422eab80462cda8cc306410a4259b422 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 21:31:10 +0100
Subject: [PATCH 02/38] WIP: 512Math cbrt

---
 test/0.8.25/512Math.t.sol | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol
index d202c4518..af23c317a 100644
--- a/test/0.8.25/512Math.t.sol
+++ b/test/0.8.25/512Math.t.sol
@@ -422,6 +422,21 @@ contract Lib512MathTest is Test {
         }
     }
 
+    function test512Math_cbrt(uint256 x_hi, uint256 x_lo) external pure {
+        uint512 x = alloc().from(x_hi, x_lo);
+        uint256 r = x.cbrt();
+        uint512 r3 = alloc().omul(r, r).imul(r);
+
+        assertTrue(r3 <= x, "cbrt too high");
+        if (!(x_hi > 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f
+                    || (x_hi == 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f
+                        && x_lo > 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d79ff))) {
+            r++;
+            r3.omul(r, r).imul(r);
+            assertTrue(r3 > x, "cbrt too low");
+        }
+    }
+
     function test512Math_oshrUp(uint256 x_hi, uint256 x_lo, uint256 s) external pure {
         s = bound(s, 0, 512);
 

From f045d800f5178d3188316cb541ca4d235f14723b Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 21:33:46 +0100
Subject: [PATCH 03/38] WIP: 512Math cbrt

---
 src/utils/512Math.sol | 77 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index c63961635..b86b68deb 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -7,6 +7,7 @@ import {Clz} from "../vendor/Clz.sol";
 import {Ternary} from "./Ternary.sol";
 import {FastLogic} from "./FastLogic.sol";
 import {Sqrt} from "../vendor/Sqrt.sol";
+import {Cbrt} from "../vendor/Cbrt.sol";
 
 /*
 
@@ -180,6 +181,10 @@ WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING
 /// * osqrtUp(uint512,uint512)
 /// * isqrtUp(uint512)
 ///
+/// ### Cube root
+///
+/// * cbrt(uint512) returns (uint256)
+///
 /// ### Shifting
 ///
 /// * oshr(uint512,uint512,uint256)
@@ -358,6 +363,7 @@ library Lib512MathArithmetic {
     using Ternary for bool;
     using FastLogic for bool;
     using Sqrt for uint256;
+    using Cbrt for uint256;
 
     function _add(uint256 x, uint256 y) private pure returns (uint256 r_hi, uint256 r_lo) {
         assembly ("memory-safe") {
@@ -1809,6 +1815,77 @@ library Lib512MathArithmetic {
         return osqrtUp(r, r);
     }
 
+    function _cubeGt(uint256 r, uint256 x_hi, uint256 x_lo) private pure returns (bool isGreater) {
+        (uint256 r2_hi, uint256 r2_lo) = _mul(r, r);
+        assembly ("memory-safe") {
+            let mm := mulmod(r2_lo, r, not(0x00))
+            let lo := mul(r2_lo, r)
+            let hi_lo := sub(sub(mm, lo), lt(mm, lo))
+
+            mm := mulmod(r2_hi, r, not(0x00))
+            let cross_lo := mul(r2_hi, r)
+            let cross_hi := sub(sub(mm, cross_lo), lt(mm, cross_lo))
+
+            let hi := add(hi_lo, cross_lo)
+            let ext := add(cross_hi, lt(hi, hi_lo))
+
+            isGreater := or(ext, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo))))
+        }
+    }
+
+    function _cbrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) {
+        unchecked {
+            // Normalize `x` by a multiple of 3 so `x_hi >> 2` is a well-conditioned input
+            // for the top-limb cube root extraction.
+            uint256 shift = x_hi.clz();
+            (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift - shift % 3);
+            shift /= 3;
+
+            // Split x into base B = 2^86 "limbs":
+            //   x = x3 * B^3 + x2 * B^2 + x1 * B + x0
+            // where x3 is 254 bits and x2/x1/x0 are 86-bit limbs.
+            uint256 x3 = x_hi >> 2;
+            uint256 x2;
+            assembly ("memory-safe") {
+                x2 := or(shl(84, and(x_hi, 0x03)), shr(172, x_lo))
+            }
+
+            // First limb from 256-bit cbrt.
+            uint256 r_hi = x3.cbrt();
+            uint256 r_hi_sq = r_hi * r_hi;
+            uint256 res = x3 - r_hi_sq * r_hi;
+            uint256 d = 3 * r_hi_sq;
+
+            // Recover low 86-bit limb from:
+            //   floor((res * B + x2) / (3 * r_hi^2)).
+            uint256 n_hi = res >> 170;
+            uint256 n_lo = (res << 86) | x2;
+            uint256 r_lo = n_hi == 0 ? n_lo / d : _div(n_hi, n_lo, d);
+
+            // Second-order correction for the omitted quadratic term in the first-limb lift.
+            // With B = 2^86, q := r_lo, and a := r_hi:
+            //   q' = q - floor(q^2 / (a * B)).
+            // This removes almost all of the overshoot from the first-order estimate.
+            r_lo -= (r_lo * r_lo) / (r_hi << 86);
+            r = (r_hi << 86) + r_lo;
+
+            // Exact floor correction.
+            r = r.unsafeDec(_cubeGt(r, x_hi, x_lo));
+
+            return r >> shift;
+        }
+    }
+
+    function cbrt(uint512 x) internal pure returns (uint256) {
+        (uint256 x_hi, uint256 x_lo) = x.into();
+
+        if (x_hi == 0) {
+            return x_lo.cbrt();
+        }
+
+        return _cbrt(x_hi, x_lo);
+    }
+
     function oshr(uint512 r, uint512 x, uint256 s) internal pure returns (uint512) {
         (uint256 x_hi, uint256 x_lo) = x.into();
         (uint256 r_hi, uint256 r_lo) = _shr(x_hi, x_lo, s);

From 8a7905be7f4183d4b1b6db7a5ac74ab27fdea022 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 21:44:42 +0100
Subject: [PATCH 04/38] Ignore spam solc error codes

---
 foundry.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/foundry.toml b/foundry.toml
index 4cff5a01d..d86543996 100644
--- a/foundry.toml
+++ b/foundry.toml
@@ -1,5 +1,6 @@
 [profile.default]
 unchecked_cheatcode_artifacts = true
+ignored_error_codes = [2424, 4591]
 
 # https://github.com/foundry-rs/foundry/issues/6780#issuecomment-1962319449
 bytecode_hash = "none"

From 4ef89a66314d2c8bf48dc21c4e0c16e504d2539f Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 21:45:07 +0100
Subject: [PATCH 05/38] Cleanup

---
 src/utils/512Math.sol | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index b86b68deb..1f6863339 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1696,12 +1696,10 @@ library Lib512MathArithmetic {
     }
 
     function _sqrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) {
+        /// Our general approach is to apply Zimmerman's "Karatsuba Square Root" algorithm
+        /// https://inria.hal.science/inria-00072854/document with the helpers from Solady and
+        /// 512Math. This approach is inspired by https://github.com/SimonSuckut/Solidity_Uint512/
         unchecked {
-            /// Our general approach is to apply Zimmerman's "Karatsuba Square Root" algorithm
-            /// https://inria.hal.science/inria-00072854/document with the helpers from Solady and
-            /// 512Math. This approach is inspired by
-            /// https://github.com/SimonSuckut/Solidity_Uint512/
-
             // Normalize `x` so the top word has its MSB in bit 255 or 254.
             //   x ≥ 2⁵¹⁰
             uint256 shift = x_hi.clz();
@@ -1815,25 +1813,27 @@ library Lib512MathArithmetic {
         return osqrtUp(r, r);
     }
 
-    function _cubeGt(uint256 r, uint256 x_hi, uint256 x_lo) private pure returns (bool isGreater) {
-        (uint256 r2_hi, uint256 r2_lo) = _mul(r, r);
+    function _cubeGt(uint256 a, uint256 b_hi, uint256 b_lo) private pure returns (bool r) {
+        (uint256 a2_hi, uint256 a2_lo) = _mul(a, a);
         assembly ("memory-safe") {
-            let mm := mulmod(r2_lo, r, not(0x00))
-            let lo := mul(r2_lo, r)
+            let mm := mulmod(a2_lo, a, not(0x00))
+            let lo := mul(a2_lo, a)
             let hi_lo := sub(sub(mm, lo), lt(mm, lo))
 
-            mm := mulmod(r2_hi, r, not(0x00))
-            let cross_lo := mul(r2_hi, r)
+            mm := mulmod(a2_hi, a, not(0x00))
+            let cross_lo := mul(a2_hi, a)
             let cross_hi := sub(sub(mm, cross_lo), lt(mm, cross_lo))
 
             let hi := add(hi_lo, cross_lo)
             let ext := add(cross_hi, lt(hi, hi_lo))
 
-            isGreater := or(ext, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo))))
+            r := or(ext, or(gt(hi, b_hi), and(eq(hi, b_hi), gt(lo, b_lo))))
         }
     }
 
     function _cbrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) {
+        /// This is the same general technique as we applied in `_sqrt`, patterned after Zimmerman's
+        /// "Karatsuba Square Root" algorithm, but adapted to compute cube roots instead.
         unchecked {
             // Normalize `x` by a multiple of 3 so `x_hi >> 2` is a well-conditioned input
             // for the top-limb cube root extraction.
@@ -1860,13 +1860,13 @@ library Lib512MathArithmetic {
             //   floor((res * B + x2) / (3 * r_hi^2)).
             uint256 n_hi = res >> 170;
             uint256 n_lo = (res << 86) | x2;
-            uint256 r_lo = n_hi == 0 ? n_lo / d : _div(n_hi, n_lo, d);
+            uint256 r_lo = n_hi == 0 ? n_lo.unsafeDiv(d) : _div(n_hi, n_lo, d);
 
             // Second-order correction for the omitted quadratic term in the first-limb lift.
             // With B = 2^86, q := r_lo, and a := r_hi:
             //   q' = q - floor(q^2 / (a * B)).
             // This removes almost all of the overshoot from the first-order estimate.
-            r_lo -= (r_lo * r_lo) / (r_hi << 86);
+            r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86);
             r = (r_hi << 86) + r_lo;
 
             // Exact floor correction.

From cd876e1b478db1d1f6622ec312a5f21073cd91e6 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 21:52:49 +0100
Subject: [PATCH 06/38] Golf

---
 src/utils/512Math.sol | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 1f6863339..0fa300514 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1858,9 +1858,23 @@ library Lib512MathArithmetic {
 
             // Recover low 86-bit limb from:
             //   floor((res * B + x2) / (3 * r_hi^2)).
-            uint256 n_hi = res >> 170;
-            uint256 n_lo = (res << 86) | x2;
-            uint256 r_lo = n_hi == 0 ? n_lo.unsafeDiv(d) : _div(n_hi, n_lo, d);
+            // `res * B + x2` is up to 257 bits. Here `c := res >> 170` is a single
+            // carry bit (`res < 2^171` for this normalized range), so we can avoid
+            // 512-bit division by folding that carry manually, exactly as in `_sqrt`.
+            uint256 r_lo;
+            assembly ("memory-safe") {
+                let n := or(shl(86, res), x2)
+                r_lo := div(n, d)
+
+                let c := shr(170, res)
+                if c {
+                    let neg_c := not(0x00)
+                    let rem := mod(n, d)
+                    r_lo := add(r_lo, div(neg_c, d))
+                    rem := add(rem, add(0x01, mod(neg_c, d)))
+                    r_lo := add(r_lo, div(rem, d))
+                }
+            }
 
             // Second-order correction for the omitted quadratic term in the first-limb lift.
             // With B = 2^86, q := r_lo, and a := r_hi:

From 669696cbe8090f377898f0ac33321a39cf4959d3 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 21:59:43 +0100
Subject: [PATCH 07/38] Golf

---
 src/utils/512Math.sol | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 0fa300514..c870b8cf4 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1851,7 +1851,21 @@ library Lib512MathArithmetic {
             }
 
             // First limb from 256-bit cbrt.
-            uint256 r_hi = x3.cbrt();
+            // Here x3 is always in [2^251, 2^254), so the Solady seed
+            // 2^ceil((log2(x3) + 2) / 3) is exactly either 2^84 or 2^85.
+            uint256 r_hi;
+            assembly ("memory-safe") {
+                r_hi := shl(shr(253, x3), shl(84, 1))
+
+                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
+                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
+                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
+                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
+                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
+                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
+                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
+                r_hi := sub(r_hi, lt(div(x3, mul(r_hi, r_hi)), r_hi))
+            }
             uint256 r_hi_sq = r_hi * r_hi;
             uint256 res = x3 - r_hi_sq * r_hi;
             uint256 d = 3 * r_hi_sq;

From aa9499f6648ce3954b56ef0fa5f50128a9da3ea7 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 22:01:42 +0100
Subject: [PATCH 08/38] Golf

---
 src/utils/512Math.sol | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index c870b8cf4..862e8819d 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1855,7 +1855,7 @@ library Lib512MathArithmetic {
             // 2^ceil((log2(x3) + 2) / 3) is exactly either 2^84 or 2^85.
             uint256 r_hi;
             assembly ("memory-safe") {
-                r_hi := shl(shr(253, x3), shl(84, 1))
+                r_hi := 0x1000000000000000000000
 
                 r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
                 r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)

From 00660e3c154ab9060920b3f13500f0047805f7c1 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 22:37:04 +0100
Subject: [PATCH 09/38] Golf

---
 src/utils/512Math.sol | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 862e8819d..c180a09fd 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1813,24 +1813,6 @@ library Lib512MathArithmetic {
         return osqrtUp(r, r);
     }
 
-    function _cubeGt(uint256 a, uint256 b_hi, uint256 b_lo) private pure returns (bool r) {
-        (uint256 a2_hi, uint256 a2_lo) = _mul(a, a);
-        assembly ("memory-safe") {
-            let mm := mulmod(a2_lo, a, not(0x00))
-            let lo := mul(a2_lo, a)
-            let hi_lo := sub(sub(mm, lo), lt(mm, lo))
-
-            mm := mulmod(a2_hi, a, not(0x00))
-            let cross_lo := mul(a2_hi, a)
-            let cross_hi := sub(sub(mm, cross_lo), lt(mm, cross_lo))
-
-            let hi := add(hi_lo, cross_lo)
-            let ext := add(cross_hi, lt(hi, hi_lo))
-
-            r := or(ext, or(gt(hi, b_hi), and(eq(hi, b_hi), gt(lo, b_lo))))
-        }
-    }
-
     function _cbrt(uint256 x_hi, uint256 x_lo) private pure returns (uint256 r) {
         /// This is the same general technique as we applied in `_sqrt`, patterned after Zimmerman's
         /// "Karatsuba Square Root" algorithm, but adapted to compute cube roots instead.
@@ -1898,7 +1880,20 @@ library Lib512MathArithmetic {
             r = (r_hi << 86) + r_lo;
 
             // Exact floor correction.
-            r = r.unsafeDec(_cubeGt(r, x_hi, x_lo));
+            assembly ("memory-safe") {
+                let mm0 := mulmod(r, r, not(0x00))
+                let r2_lo := mul(r, r)
+                let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo))
+
+                let mm1 := mulmod(r2_lo, r, not(0x00))
+                let lo := mul(r2_lo, r)
+                let hi_lo := sub(sub(mm1, lo), lt(mm1, lo))
+
+                let cross_lo := mul(r2_hi, r)
+                let hi := add(hi_lo, cross_lo)
+
+                r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo))))
+            }
 
             return r >> shift;
         }

From e783514e0ba1f2a3167aa8b2f37cfda714c3e277 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 22:45:57 +0100
Subject: [PATCH 10/38] Golf

---
 src/utils/512Math.sol | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index c180a09fd..78055aba2 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1819,9 +1819,8 @@ library Lib512MathArithmetic {
         unchecked {
             // Normalize `x` by a multiple of 3 so `x_hi >> 2` is a well-conditioned input
             // for the top-limb cube root extraction.
-            uint256 shift = x_hi.clz();
-            (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift - shift % 3);
-            shift /= 3;
+            uint256 shift = x_hi.clz() / 3;
+            (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift * 3);
 
             // Split x into base B = 2^86 "limbs":
             //   x = x3 * B^3 + x2 * B^2 + x1 * B + x0

From 8cd4b21adcc09d55ec80843fd5f01461a04e0711 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Mon, 23 Feb 2026 22:58:54 +0100
Subject: [PATCH 11/38] Cleanup

---
 src/vendor/Cbrt.sol | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol
index cb0f80e0f..3604c919c 100644
--- a/src/vendor/Cbrt.sol
+++ b/src/vendor/Cbrt.sol
@@ -9,7 +9,6 @@ library Cbrt {
     /// Formally verified by xuwinnie:
     /// https://github.com/vectorized/solady/blob/main/audits/xuwinnie-solady-cbrt-proof.pdf
     function cbrt(uint256 x) internal pure returns (uint256 z) {
-        /// @solidity memory-safe-assembly
         assembly ("memory-safe") {
             // Initial guess z = 2^ceil((log2(x) + 2) / 3).
             // Since log2(x) = 255 - clz(x), the expression shl((257 - clz(x)) / 3, 1)

From b6445ee69faaa8aa9c4a05e213b78be340743adb Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 08:54:20 +0100
Subject: [PATCH 12/38] Golf

---
 src/utils/512Math.sol | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 78055aba2..094e229c0 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1835,6 +1835,8 @@ library Lib512MathArithmetic {
             // Here x3 is always in [2^251, 2^254), so the Solady seed
             // 2^ceil((log2(x3) + 2) / 3) is exactly either 2^84 or 2^85.
             uint256 r_hi;
+            uint256 res;
+            uint256 d;
             assembly ("memory-safe") {
                 r_hi := 0x1000000000000000000000
 
@@ -1844,12 +1846,26 @@ library Lib512MathArithmetic {
                 r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
                 r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
                 r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
-                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
-                r_hi := sub(r_hi, lt(div(x3, mul(r_hi, r_hi)), r_hi))
+
+                let r_hi_sq := mul(r_hi, r_hi)
+                let r_hi_cube := mul(r_hi_sq, r_hi)
+                if gt(r_hi_cube, x3) {
+                    // We gate the 7th Newton-Raphson step on whether it has overestimated. The
+                    // second-order correction below is sufficient to correct for small
+                    // underestimation. This branch is net gas-optimizing.
+                    r_hi := div(add(add(div(x3, r_hi_sq), r_hi), r_hi), 3)
+                    r_hi_sq := mul(r_hi, r_hi)
+                    r_hi_cube := mul(r_hi_sq, r_hi)
+                    if gt(r_hi_cube, x3) {
+                        r_hi := sub(r_hi, 0x01)
+                        r_hi_sq := mul(r_hi, r_hi)
+                        r_hi_cube := mul(r_hi_sq, r_hi)
+                    }
+                }
+
+                res := sub(x3, r_hi_cube)
+                d := mul(3, r_hi_sq)
             }
-            uint256 r_hi_sq = r_hi * r_hi;
-            uint256 res = x3 - r_hi_sq * r_hi;
-            uint256 d = 3 * r_hi_sq;
 
             // Recover low 86-bit limb from:
             //   floor((res * B + x2) / (3 * r_hi^2)).

From a27a7e0b7a207ea5781168a19fa7caa8573f8a6b Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 09:31:04 +0100
Subject: [PATCH 13/38] Cleanup

---
 src/utils/512Math.sol | 80 +++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 38 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 094e229c0..d0f35ec02 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1700,7 +1700,8 @@ library Lib512MathArithmetic {
         /// https://inria.hal.science/inria-00072854/document with the helpers from Solady and
         /// 512Math. This approach is inspired by https://github.com/SimonSuckut/Solidity_Uint512/
         unchecked {
-            // Normalize `x` so the top word has its MSB in bit 255 or 254.
+            // Normalize `x` so the top word has its MSB in bit 255 or 254. This makes the "shift
+            // back" step exact.
             //   x ≥ 2⁵¹⁰
             uint256 shift = x_hi.clz();
             (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift & 0xfe);
@@ -1817,67 +1818,69 @@ library Lib512MathArithmetic {
         /// This is the same general technique as we applied in `_sqrt`, patterned after Zimmerman's
         /// "Karatsuba Square Root" algorithm, but adapted to compute cube roots instead.
         unchecked {
-            // Normalize `x` by a multiple of 3 so `x_hi >> 2` is a well-conditioned input
-            // for the top-limb cube root extraction.
+            // Normalize `x` so that its MSB is in bit 255, 254, or 253. This makes the left shift a
+            // multiple of 3 so that the "shift back" un-normalization step is exact.
+            //   x ≥ 2⁵⁰⁹
             uint256 shift = x_hi.clz() / 3;
             (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift * 3);
 
-            // Split x into base B = 2^86 "limbs":
-            //   x = x3 * B^3 + x2 * B^2 + x1 * B + x0
-            // where x3 is 254 bits and x2/x1/x0 are 86-bit limbs.
-            uint256 x3 = x_hi >> 2;
-            uint256 x2;
+            // Zimmerman's "Karatsuba Square Root" algorithm works with limbs of `r` that are half
+            // of a word. For cube root, we use limbs of `r` that are one third of a word. The
+            // initial step to compute the first "limb" of `r` uses the "normal" cube root algorithm
+            // and consumes the first (almost) word of `x`. The second and final limb of `r` is
+            // computed using an analogue of the Karatsuba step from the original algorithm,
+            // followed by a pair of cleanup steps. `limb_hi` is the next 86-bit limb of `x` after
+            // the first whole-ish word.
+            uint256 limb_hi;
             assembly ("memory-safe") {
-                x2 := or(shl(84, and(x_hi, 0x03)), shr(172, x_lo))
+                limb_hi := or(shl(0x54, and(x_hi, 0x03)), shr(0xac, x_lo))
             }
 
-            // First limb from 256-bit cbrt.
-            // Here x3 is always in [2^251, 2^254), so the Solady seed
-            // 2^ceil((log2(x3) + 2) / 3) is exactly either 2^84 or 2^85.
+            // Now we run the "normal" cube root algorithm to obtain the first limb of `r`, which we
+            // store in `r_hi`. `res` is the residue after this first operation and `d` is the
+            // derivative/denominator for the subsequent outer Newton step of the Karatsuba "loop".
             uint256 r_hi;
             uint256 res;
             uint256 d;
             assembly ("memory-safe") {
-                r_hi := 0x1000000000000000000000
+                let w := shr(2, x_hi) // w ≥ 2²⁵¹; w < 2²⁵⁴ from the above normalization
+                r_hi := 0x1000000000000000000000 // Given `w` in its range, this seed is suitable
 
-                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
-                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
-                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
-                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
-                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
-                r_hi := div(add(add(div(x3, mul(r_hi, r_hi)), r_hi), r_hi), 3)
+                r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03)
+                r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03)
+                r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03)
+                r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03)
+                r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03)
+                r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03)
 
                 let r_hi_sq := mul(r_hi, r_hi)
                 let r_hi_cube := mul(r_hi_sq, r_hi)
-                if gt(r_hi_cube, x3) {
-                    // We gate the 7th Newton-Raphson step on whether it has overestimated. The
-                    // second-order correction below is sufficient to correct for small
-                    // underestimation. This branch is net gas-optimizing.
-                    r_hi := div(add(add(div(x3, r_hi_sq), r_hi), r_hi), 3)
+                if gt(r_hi_cube, w) {
+                    // We gate the 7th Newton-Raphson step and ceil/floor cleanup on whether it has
+                    // overestimated. The second-order correction further below is sufficient to
+                    // correct for small underestimation. This branch is net gas-optimizing.
+                    r_hi := div(add(add(div(w, r_hi_sq), r_hi), r_hi), 0x03)
+                    r_hi := sub(r_hi, lt(div(w, mul(r_hi, r_hi)), r_hi))
                     r_hi_sq := mul(r_hi, r_hi)
                     r_hi_cube := mul(r_hi_sq, r_hi)
-                    if gt(r_hi_cube, x3) {
-                        r_hi := sub(r_hi, 0x01)
-                        r_hi_sq := mul(r_hi, r_hi)
-                        r_hi_cube := mul(r_hi_sq, r_hi)
-                    }
                 }
 
-                res := sub(x3, r_hi_cube)
-                d := mul(3, r_hi_sq)
+                res := sub(w, r_hi_cube)
+                d := mul(0x03, r_hi_sq)
             }
 
-            // Recover low 86-bit limb from:
-            //   floor((res * B + x2) / (3 * r_hi^2)).
-            // `res * B + x2` is up to 257 bits. Here `c := res >> 170` is a single
-            // carry bit (`res < 2^171` for this normalized range), so we can avoid
-            // 512-bit division by folding that carry manually, exactly as in `_sqrt`.
+            // This is the Karatsuba step. The 86-bit lower limb of `r` is (almost):
+            //   r_lo = ⌊(res ⋅ 2⁸⁶ + limb_hi) / (3 ⋅ r_hi²)⌋
+            //   res = (res ⋅ 2⁸⁶ + limb_hi) % (3 ⋅ r_hi²)
+            // Where `res` is the (nearly) 2-limb residue from the previous "normal" cube root step.
             uint256 r_lo;
             assembly ("memory-safe") {
-                let n := or(shl(86, res), x2)
+                let n := or(shl(0x56, res), limb_hi)
                 r_lo := div(n, d)
 
-                let c := shr(170, res)
+                let c := shr(0xaa, res)
+                // If `res` was 171 bits (one more than expected), then `n` overflowed to 257
+                // bits. Explicitly handling the carry avoids 512-bit division.
                 if c {
                     let neg_c := not(0x00)
                     let rem := mod(n, d)
@@ -1910,6 +1913,7 @@ library Lib512MathArithmetic {
                 r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo))))
             }
 
+            // Un-normalize
             return r >> shift;
         }
     }

From 243ee2eb20cda610cd8026619830428e7d03dcf5 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 09:45:01 +0100
Subject: [PATCH 14/38] Cleanup

---
 src/utils/512Math.sol | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index d0f35ec02..ed58b25c1 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1838,7 +1838,7 @@ library Lib512MathArithmetic {
 
             // Now we run the "normal" cube root algorithm to obtain the first limb of `r`, which we
             // store in `r_hi`. `res` is the residue after this first operation and `d` is the
-            // derivative/denominator for the subsequent outer Newton step of the Karatsuba "loop".
+            // derivative/denominator for the subsequent Karatsuba step.
             uint256 r_hi;
             uint256 res;
             uint256 d;
@@ -1890,14 +1890,13 @@ library Lib512MathArithmetic {
                 }
             }
 
-            // Second-order correction for the omitted quadratic term in the first-limb lift.
-            // With B = 2^86, q := r_lo, and a := r_hi:
-            //   q' = q - floor(q^2 / (a * B)).
-            // This removes almost all of the overshoot from the first-order estimate.
+            // Unlike the square-root case, the error from the linear Karatsuba step can still be
+            // large because the expansion has more terms. We do a quadratic correction to get close
+            // enough that the single subtraction is sufficient for exactness.
             r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86);
             r = (r_hi << 86) + r_lo;
 
-            // Exact floor correction.
+            // Our error is now down to 1ulp. Check if `r` overshoots and correct.
             assembly ("memory-safe") {
                 let mm0 := mulmod(r, r, not(0x00))
                 let r2_lo := mul(r, r)
@@ -1905,10 +1904,7 @@ library Lib512MathArithmetic {
 
                 let mm1 := mulmod(r2_lo, r, not(0x00))
                 let lo := mul(r2_lo, r)
-                let hi_lo := sub(sub(mm1, lo), lt(mm1, lo))
-
-                let cross_lo := mul(r2_hi, r)
-                let hi := add(hi_lo, cross_lo)
+                let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r))
 
                 r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo))))
             }

From 2b0fb15fe9ca3cf5761fe1073fe717360b6ef42c Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 10:16:18 +0100
Subject: [PATCH 15/38] Add a fuzz test to explicitly exercise a known
 weak-point of some `cbrt` implementations

---
 test/0.8.25/512Math.t.sol | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol
index af23c317a..8b6e5b5b0 100644
--- a/test/0.8.25/512Math.t.sol
+++ b/test/0.8.25/512Math.t.sol
@@ -437,6 +437,12 @@ contract Lib512MathTest is Test {
         }
     }
 
+    function test512Math_cbrt_perfectCube(uint256 r) external pure {
+        r = bound(r, 1, 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628);
+        uint512 x = alloc().omul(r, r).imul(r);
+        assertEq(x.cbrt(), r);
+    }
+
     function test512Math_oshrUp(uint256 x_hi, uint256 x_lo, uint256 s) external pure {
         s = bound(s, 0, 512);
 

From 3a4e2882ba46ae2eb8e2a079529676908d5fb407 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 10:43:54 +0100
Subject: [PATCH 16/38] WIP: 512-bit cbrtUp

---
 test/0.8.25/512Math.t.sol | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol
index 8b6e5b5b0..2d4a4df2f 100644
--- a/test/0.8.25/512Math.t.sol
+++ b/test/0.8.25/512Math.t.sol
@@ -443,6 +443,33 @@ contract Lib512MathTest is Test {
         assertEq(x.cbrt(), r);
     }
 
+    function test512Math_cbrtUp(uint256 x_hi, uint256 x_lo) external pure {
+        uint512 x = alloc().from(x_hi, x_lo);
+        uint256 r = x.cbrtUp();
+        uint512 r3 = alloc().omul(r, r).imul(r);
+
+        if (
+            x_hi > 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f
+                || (x_hi == 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f
+                    && x_lo > 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d7a00)
+        ) {
+            assertEq(r, 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885629, "cbrtUp overflow");
+        } else {
+            assertTrue(r3 >= x, "cbrtUp too low");
+        }
+        if (x_hi != 0 && x_lo != 0) {
+            r--;
+            r3.omul(r, r).imul(r);
+            assertTrue(r3 < x, "cbrtUp too high");
+        }
+    }
+
+    function test512Math_cbrtUp_perfectCube(uint256 r) external pure {
+        r = bound(r, 1, 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628);
+        uint512 x = alloc().omul(r, r).imul(r);
+        assertEq(x.cbrtUp(), r);
+    }
+
     function test512Math_oshrUp(uint256 x_hi, uint256 x_lo, uint256 s) external pure {
         s = bound(s, 0, 512);
 

From d1fedff4a492a121c91465ca65998bfdf293e4b8 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 10:44:32 +0100
Subject: [PATCH 17/38] `cbrtUp`

---
 src/utils/512Math.sol | 58 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index ed58b25c1..9cc308f29 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -184,6 +184,7 @@ WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING
 /// ### Cube root
 ///
 /// * cbrt(uint512) returns (uint256)
+/// * cbrtUp(uint512) returns (uint256)
 ///
 /// ### Shifting
 ///
@@ -1895,33 +1896,60 @@ library Lib512MathArithmetic {
             // enough that the single subtraction is sufficient for exactness.
             r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86);
             r = (r_hi << 86) + r_lo;
-
-            // Our error is now down to 1ulp. Check if `r` overshoots and correct.
-            assembly ("memory-safe") {
-                let mm0 := mulmod(r, r, not(0x00))
-                let r2_lo := mul(r, r)
-                let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo))
-
-                let mm1 := mulmod(r2_lo, r, not(0x00))
-                let lo := mul(r2_lo, r)
-                let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r))
-
-                r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo))))
-            }
+            // Our error is now down to 1ulp.
 
             // Un-normalize
             return r >> shift;
         }
     }
 
-    function cbrt(uint512 x) internal pure returns (uint256) {
+    function cbrt(uint512 x) internal pure returns (uint256 r) {
         (uint256 x_hi, uint256 x_lo) = x.into();
 
         if (x_hi == 0) {
             return x_lo.cbrt();
         }
 
-        return _cbrt(x_hi, x_lo);
+        r = _cbrt(x_hi, x_lo);
+
+        // `_cbrt` gives a result within 1ulp. Check if `r` overshoots and correct.
+        assembly ("memory-safe") {
+            let mm0 := mulmod(r, r, not(0x00))
+            let r2_lo := mul(r, r)
+            let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo))
+
+            let mm1 := mulmod(r2_lo, r, not(0x00))
+            let lo := mul(r2_lo, r)
+            let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r))
+
+            r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo))))
+        }
+    }
+
+    function cbrtUp(uint512 x) internal pure returns (uint256 r) {
+        (uint256 x_hi, uint256 x_lo) = x.into();
+
+        if (x_hi == 0) {
+            r = x_lo.cbrt();
+            unchecked {
+                r = r.unsafeInc(r * r * r < x_lo);
+            }
+        } else {
+            r = _cbrt(x_hi, x_lo);
+
+            // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct.
+            assembly ("memory-safe") {
+                let mm0 := mulmod(r, r, not(0x00))
+                let r2_lo := mul(r, r)
+                let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo))
+
+                let mm1 := mulmod(r2_lo, r, not(0x00))
+                let lo := mul(r2_lo, r)
+                let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r))
+
+                r := add(r, or(lt(hi, x_hi), and(eq(hi, x_hi), lt(lo, x_lo))))
+            }
+        }
     }
 
     function oshr(uint512 r, uint512 x, uint256 s) internal pure returns (uint512) {

From 09a9018cf8bc570235401c3d879e37fe63ccb5f3 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 11:00:47 +0100
Subject: [PATCH 18/38] Golf

---
 src/utils/512Math.sol |  5 +----
 src/vendor/Cbrt.sol   | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 9cc308f29..81bb596bf 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1930,10 +1930,7 @@ library Lib512MathArithmetic {
         (uint256 x_hi, uint256 x_lo) = x.into();
 
         if (x_hi == 0) {
-            r = x_lo.cbrt();
-            unchecked {
-                r = r.unsafeInc(r * r * r < x_lo);
-            }
+            r = x_lo.cbrtUp();
         } else {
             r = _cbrt(x_hi, x_lo);
 
diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol
index 3604c919c..e1465ba2a 100644
--- a/src/vendor/Cbrt.sol
+++ b/src/vendor/Cbrt.sol
@@ -8,7 +8,7 @@ library Cbrt {
     /// https://github.com/pcaversaccio/snekmate/blob/main/src/snekmate/utils/math.vy
     /// Formally verified by xuwinnie:
     /// https://github.com/vectorized/solady/blob/main/audits/xuwinnie-solady-cbrt-proof.pdf
-    function cbrt(uint256 x) internal pure returns (uint256 z) {
+    function _cbrt(uint256 x) private pure returns (uint256 z) {
         assembly ("memory-safe") {
             // Initial guess z = 2^ceil((log2(x) + 2) / 3).
             // Since log2(x) = 255 - clz(x), the expression shl((257 - clz(x)) / 3, 1)
@@ -22,8 +22,22 @@ library Cbrt {
             z := div(add(add(div(x, mul(z, z)), z), z), 3)
             z := div(add(add(div(x, mul(z, z)), z), z), 3)
             z := div(add(add(div(x, mul(z, z)), z), z), 3)
+        }
+    }
+
+    function cbrt(uint256 x) internal pure returns (uint256 z) {
+        z = _cbrt(x);
+        assembly ("memory-safe") {
             // Round down.
             z := sub(z, lt(div(x, mul(z, z)), z))
         }
     }
+
+    function cbrtUp(uint256 x) internal pure returns (uint256 z) {
+        z = _cbrt(x);
+        assembly ("memory-safe") {
+            // Round up.
+            z := add(z, lt(mul(z, mul(z, z)), x))
+        }
+    }
 }

From 9a361bd00a6a68f44278c05113e065600abd2cd7 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 11:07:40 +0100
Subject: [PATCH 19/38] Improve test strictness

---
 test/0.8.25/512Math.t.sol | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol
index 2d4a4df2f..037d972d4 100644
--- a/test/0.8.25/512Math.t.sol
+++ b/test/0.8.25/512Math.t.sol
@@ -428,9 +428,13 @@ contract Lib512MathTest is Test {
         uint512 r3 = alloc().omul(r, r).imul(r);
 
         assertTrue(r3 <= x, "cbrt too high");
-        if (!(x_hi > 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f
-                    || (x_hi == 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f
-                        && x_lo > 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d79ff))) {
+        if (
+            x_hi > 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f
+                || (x_hi == 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f
+                    && x_lo > 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d79ff)
+        ) {
+            assertEq(r, 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628, "cbrt overflow");
+        } else {
             r++;
             r3.omul(r, r).imul(r);
             assertTrue(r3 > x, "cbrt too low");

From d0959f67a777d02bf5175a4d8d8bcb6138fd5b92 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 11:10:49 +0100
Subject: [PATCH 20/38] Improve test strictness

---
 test/0.8.25/512Math.t.sol | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol
index 037d972d4..2134d15ff 100644
--- a/test/0.8.25/512Math.t.sol
+++ b/test/0.8.25/512Math.t.sol
@@ -461,7 +461,7 @@ contract Lib512MathTest is Test {
         } else {
             assertTrue(r3 >= x, "cbrtUp too low");
         }
-        if (x_hi != 0 && x_lo != 0) {
+        if (x_hi != 0 || x_lo != 0) {
             r--;
             r3.omul(r, r).imul(r);
             assertTrue(r3 < x, "cbrtUp too high");

From a5a8fcefae182fad61f082384bcf5f4d5c82c714 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 11:41:52 +0100
Subject: [PATCH 21/38] Comment

---
 src/utils/512Math.sol | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 81bb596bf..61d021b7a 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1826,10 +1826,10 @@ library Lib512MathArithmetic {
             (, x_hi, x_lo) = _shl256(x_hi, x_lo, shift * 3);
 
             // Zimmerman's "Karatsuba Square Root" algorithm works with limbs of `r` that are half
-            // of a word. For cube root, we use limbs of `r` that are one third of a word. The
-            // initial step to compute the first "limb" of `r` uses the "normal" cube root algorithm
-            // and consumes the first (almost) word of `x`. The second and final limb of `r` is
-            // computed using an analogue of the Karatsuba step from the original algorithm,
+            // of a word. For cube root, we use limbs of `r` that are (roughly) one third of a
+            // word. The initial step to compute the first "limb" of `r` uses the "normal" cube root
+            // algorithm and consumes the first (almost) word of `x`. The second and final limb of
+            // `r` is computed using an analogue of the Karatsuba step from the original algorithm,
             // followed by a pair of cleanup steps. `limb_hi` is the next 86-bit limb of `x` after
             // the first whole-ish word.
             uint256 limb_hi;

From c573dab42713a8d311575ed66be4adbca0da240a Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 11:59:04 +0100
Subject: [PATCH 22/38] Comment

---
 src/utils/512Math.sol | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 61d021b7a..54b94f217 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1872,8 +1872,9 @@ library Lib512MathArithmetic {
 
             // This is the Karatsuba step. The 86-bit lower limb of `r` is (almost):
             //   r_lo = ⌊(res ⋅ 2⁸⁶ + limb_hi) / (3 ⋅ r_hi²)⌋
-            //   res = (res ⋅ 2⁸⁶ + limb_hi) % (3 ⋅ r_hi²)
-            // Where `res` is the (nearly) 2-limb residue from the previous "normal" cube root step.
+            // Where `res` is the (nearly) 2-limb residue from the previous "normal" cube root
+            // step. We discard `res` after this step and perform a quadratic correction instead of
+            // the underflow check from Zimmerman
             uint256 r_lo;
             assembly ("memory-safe") {
                 let n := or(shl(0x56, res), limb_hi)
@@ -1894,6 +1895,11 @@ library Lib512MathArithmetic {
             // Unlike the square-root case, the error from the linear Karatsuba step can still be
             // large because the expansion has more terms. We do a quadratic correction to get close
             // enough that the single subtraction is sufficient for exactness.
+            //
+            // In the square-root version, the only ignored term in (s + q)² is q², which is small
+            // enough for a 1ulp correction. For cube root, the binomial expansion (r_hi·2⁸⁶ +
+            // r_lo)³ contains the cross term 3·(r_hi·2⁸⁶)·r_lo². The linear Karatsuba step
+            // overestimates r_lo by ≈r_lo²/(r_hi·2⁸⁶). After correction, this leaves only r_lo³
             r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86);
             r = (r_hi << 86) + r_lo;
             // Our error is now down to 1ulp.

From 67cc8e4d6a10b2c23d31e42574ae41e56179fb74 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 12:04:31 +0100
Subject: [PATCH 23/38] Consistency

---
 src/utils/512Math.sol | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 54b94f217..c15ae36cb 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1936,22 +1936,22 @@ library Lib512MathArithmetic {
         (uint256 x_hi, uint256 x_lo) = x.into();
 
         if (x_hi == 0) {
-            r = x_lo.cbrtUp();
-        } else {
-            r = _cbrt(x_hi, x_lo);
+            return x_lo.cbrtUp();
+        }
 
-            // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct.
-            assembly ("memory-safe") {
-                let mm0 := mulmod(r, r, not(0x00))
-                let r2_lo := mul(r, r)
-                let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo))
+        r = _cbrt(x_hi, x_lo);
 
-                let mm1 := mulmod(r2_lo, r, not(0x00))
-                let lo := mul(r2_lo, r)
-                let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r))
+        // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct.
+        assembly ("memory-safe") {
+            let mm0 := mulmod(r, r, not(0x00))
+            let r2_lo := mul(r, r)
+            let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo))
 
-                r := add(r, or(lt(hi, x_hi), and(eq(hi, x_hi), lt(lo, x_lo))))
-            }
+            let mm1 := mulmod(r2_lo, r, not(0x00))
+            let lo := mul(r2_lo, r)
+            let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r))
+
+            r := add(r, or(lt(hi, x_hi), and(eq(hi, x_hi), lt(lo, x_lo))))
         }
     }
 

From a1c9d47a7f21db2fd7c6b8cb8a5cb4c590e2e18f Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 12:10:43 +0100
Subject: [PATCH 24/38] Comment

---
 src/utils/512Math.sol | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index c15ae36cb..176e95ae0 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1899,7 +1899,8 @@ library Lib512MathArithmetic {
             // In the square-root version, the only ignored term in (s + q)² is q², which is small
             // enough for a 1ulp correction. For cube root, the binomial expansion (r_hi·2⁸⁶ +
             // r_lo)³ contains the cross term 3·(r_hi·2⁸⁶)·r_lo². The linear Karatsuba step
-            // overestimates r_lo by ≈r_lo²/(r_hi·2⁸⁶). After correction, this leaves only r_lo³
+            // overestimates r_lo by ≈r_lo²/(r_hi·2⁸⁶). After correction, this leaves only the r_lo³
+            // term, on the order of 2²⁵⁸/(3·2³⁴²), much less than 1ulp.
             r_lo -= (r_lo * r_lo).unsafeDiv(r_hi << 86);
             r = (r_hi << 86) + r_lo;
             // Our error is now down to 1ulp.

From 96732f446f9dd823ec62dd25ae1007ac597a51b2 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 12:26:41 +0100
Subject: [PATCH 25/38] Style

---
 src/utils/512Math.sol | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 176e95ae0..eceb362b0 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1844,7 +1844,7 @@ library Lib512MathArithmetic {
             uint256 res;
             uint256 d;
             assembly ("memory-safe") {
-                let w := shr(2, x_hi) // w ≥ 2²⁵¹; w < 2²⁵⁴ from the above normalization
+                let w := shr(0x02, x_hi) // w ≥ 2²⁵¹; w < 2²⁵⁴ from the above normalization
                 r_hi := 0x1000000000000000000000 // Given `w` in its range, this seed is suitable
 
                 r_hi := div(add(add(div(w, mul(r_hi, r_hi)), r_hi), r_hi), 0x03)

From 2cd9eccd37e743aa8b14846dd9d6ebd8fd3a0a67 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 12:28:34 +0100
Subject: [PATCH 26/38] Style

---
 src/utils/512Math.sol | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index eceb362b0..4a9c88f1c 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1830,12 +1830,7 @@ library Lib512MathArithmetic {
             // word. The initial step to compute the first "limb" of `r` uses the "normal" cube root
             // algorithm and consumes the first (almost) word of `x`. The second and final limb of
             // `r` is computed using an analogue of the Karatsuba step from the original algorithm,
-            // followed by a pair of cleanup steps. `limb_hi` is the next 86-bit limb of `x` after
-            // the first whole-ish word.
-            uint256 limb_hi;
-            assembly ("memory-safe") {
-                limb_hi := or(shl(0x54, and(x_hi, 0x03)), shr(0xac, x_lo))
-            }
+            // followed by a pair of cleanup steps.
 
             // Now we run the "normal" cube root algorithm to obtain the first limb of `r`, which we
             // store in `r_hi`. `res` is the residue after this first operation and `d` is the
@@ -1870,6 +1865,11 @@ library Lib512MathArithmetic {
                 d := mul(0x03, r_hi_sq)
             }
 
+            // `limb_hi` is the next 86-bit limb of `x` after the first whole-ish word.
+            uint256 limb_hi;
+            assembly ("memory-safe") {
+                limb_hi := or(shl(0x54, and(x_hi, 0x03)), shr(0xac, x_lo))
+            }
             // This is the Karatsuba step. The 86-bit lower limb of `r` is (almost):
             //   r_lo = ⌊(res ⋅ 2⁸⁶ + limb_hi) / (3 ⋅ r_hi²)⌋
             // Where `res` is the (nearly) 2-limb residue from the previous "normal" cube root

From 0eecce176d2510cdd368284c1325d89ac9e955cf Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 12:30:58 +0100
Subject: [PATCH 27/38] Style

---
 src/utils/512Math.sol | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 4a9c88f1c..ff21e6b6c 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1761,9 +1761,8 @@ library Lib512MathArithmetic {
                 // It's possible that `n` was 257 bits and overflowed (`res` was not just a single
                 // limb). Explicitly handling the carry avoids 512-bit division.
                 if c {
-                    let neg_c := not(0x00)
-                    r_lo := add(r_lo, div(neg_c, d))
-                    res := add(res, add(0x01, mod(neg_c, d)))
+                    r_lo := add(r_lo, div(not(0x00), d))
+                    res := add(res, add(0x01, mod(not(0x00), d)))
                     r_lo := add(r_lo, div(res, d))
                     res := mod(res, d)
                 }
@@ -1884,10 +1883,9 @@ library Lib512MathArithmetic {
                 // If `res` was 171 bits (one more than expected), then `n` overflowed to 257
                 // bits. Explicitly handling the carry avoids 512-bit division.
                 if c {
-                    let neg_c := not(0x00)
                     let rem := mod(n, d)
-                    r_lo := add(r_lo, div(neg_c, d))
-                    rem := add(rem, add(0x01, mod(neg_c, d)))
+                    r_lo := add(r_lo, div(not(0x00), d))
+                    rem := add(rem, add(0x01, mod(not(0x00), d)))
                     r_lo := add(r_lo, div(rem, d))
                 }
             }

From 55efb2e22629648bfcc6f30d81c2aa7e226c4d62 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 12:32:17 +0100
Subject: [PATCH 28/38] Style

---
 src/utils/512Math.sol | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index ff21e6b6c..551fa79a9 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1879,10 +1879,9 @@ library Lib512MathArithmetic {
                 let n := or(shl(0x56, res), limb_hi)
                 r_lo := div(n, d)
 
-                let c := shr(0xaa, res)
                 // If `res` was 171 bits (one more than expected), then `n` overflowed to 257
                 // bits. Explicitly handling the carry avoids 512-bit division.
-                if c {
+                if shr(0xaa, res) {
                     let rem := mod(n, d)
                     r_lo := add(r_lo, div(not(0x00), d))
                     rem := add(rem, add(0x01, mod(not(0x00), d)))

From 2873a31abe14289f8184dd43ca5103da878476f0 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 16:19:16 +0100
Subject: [PATCH 29/38] Natspec

---
 src/vendor/Cbrt.sol | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol
index e1465ba2a..ccd471dd2 100644
--- a/src/vendor/Cbrt.sol
+++ b/src/vendor/Cbrt.sol
@@ -3,7 +3,7 @@ pragma solidity ^0.8.25;
 
 // @author Modified from Solady by Vectorized and Akshay Tarpara https://github.com/Vectorized/solady/blob/ff6256a18851749e765355b3e21dc9bfa417255b/src/utils/clz/FixedPointMathLib.sol#L799-L822 under the MIT license.
 library Cbrt {
-    /// @dev Returns the cube root of `x`, rounded down.
+    /// @dev Returns the cube root of `x`, rounded to within 1ulp.
     /// Credit to bout3fiddy and pcaversaccio under AGPLv3 license:
     /// https://github.com/pcaversaccio/snekmate/blob/main/src/snekmate/utils/math.vy
     /// Formally verified by xuwinnie:
@@ -25,6 +25,7 @@ library Cbrt {
         }
     }
 
+    /// @dev Returns the cube root of `x`, rounded down.
     function cbrt(uint256 x) internal pure returns (uint256 z) {
         z = _cbrt(x);
         assembly ("memory-safe") {
@@ -33,6 +34,7 @@ library Cbrt {
         }
     }
 
+    /// @dev Returns the cube root of `x`, rounded up.
     function cbrtUp(uint256 x) internal pure returns (uint256 z) {
         z = _cbrt(x);
         assembly ("memory-safe") {

From e8fe93335adc1c36818dea2d5696b3c6a4df4cdd Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 18:23:46 +0100
Subject: [PATCH 30/38] Document overflow safety of cbrt/cbrtUp
 cube-and-compare
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add detailed comments explaining why _cbrt never returns r_max + 1 in
the overflow regime (x > r_max^3), which would cause the 256-bit
cube-and-compare to silently produce wrong results. The argument traces
through the EVM execution: the truncated div(n, d) and carry-adjusted
r_lo_pre are both constant across the regime because res varies by only
~0.62·2^83 and the quotient's fractional part stays in [0.118, 0.283].

Add fuzz test targeting the overflow-cube regime specifically.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/utils/512Math.sol     | 34 ++++++++++++++++++++++++++++++++++
 test/0.8.25/512Math.t.sol | 24 ++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 551fa79a9..b98789efe 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1916,6 +1916,38 @@ library Lib512MathArithmetic {
 
         r = _cbrt(x_hi, x_lo);
 
+        // Detailed note for the top-of-range regime:
+        // Let
+        //   r_max = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628
+        // and consider x in [r_max^3 + 1, 2^512 - 1]. In this regime, cubing r_max + 1 would
+        // overflow 512 bits, but `_cbrt` still returns r_max (never r_max + 1):
+        // 1) shift = clz(x_hi) / 3 = 0 for all such x.
+        // 2) w = x_hi >> 2 lies in [0x3f..fb0959fdf442978718ddcb, 2^254 - 1].
+        // 3) In that full interval, floor(cbrt(w)) is constant:
+        //      r_hi = 0x1965fea53d6e3c82b05999
+        //    because r_hi^3 <= w < (r_hi + 1)^3.
+        // 4) Therefore d = 3 * r_hi^2 is constant:
+        //      d = 0x78f3d1d950af414cd731fe48f48fde1309821333853.
+        // 5) In the EVM code, n = shl(86, res) | limb_hi truncates to 256 bits (res is
+        //    171 bits, so res << 86 is 257 bits). The truncated div(n, d) is constant:
+        //      div(n, d) = 0x8f3a38c7f3364c49d3405  for all x in the regime.
+        //    The carry branch (shr(0xaa, res) != 0) fires for the entire regime since
+        //    res_min = 0x50ea... is already 171 bits. After carry adjustment, r_lo_pre
+        //    is constant (see step 6). Both the pre-carry and post-carry quotients stay
+        //    in one bucket because res varies by only ~0.62·2^83 across the regime,
+        //    and limb_hi's full 86-bit range contributes < 2^{-84} to n/d (negligible
+        //    vs d ≈ 2^170.8). Total swing in the continuous quotient is ~0.164.
+        //    At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max^3) and ≈ 0.283 (at
+        //    x = 2^512 - 1), so the floor never crosses an integer boundary.
+        // 6) After the carry adjustment branch, r_lo_pre is constant:
+        //      r_lo_pre = 0x2ad0f7137bc6601d885629.
+        // 7) The quadratic correction subtracts exactly 1:
+        //      floor(r_lo_pre^2 / (r_hi << 86)) = 1
+        //    so r_lo = 0x2ad0f7137bc6601d885628 and
+        //      r = (r_hi << 86) + r_lo = r_max.
+        //
+        // So for this regime, the cube-and-compare code below only cubes r_max (which fits in
+        // 512 bits). `cbrtUp` reaches r_max + 1 only via its final +1 correction.
         // `_cbrt` gives a result within 1ulp. Check if `r` overshoots and correct.
         assembly ("memory-safe") {
             let mm0 := mulmod(r, r, not(0x00))
@@ -1939,6 +1971,8 @@ library Lib512MathArithmetic {
 
         r = _cbrt(x_hi, x_lo);
 
+        // See the detailed overflow-regime note in `cbrt` above. In particular, near 2^512,
+        // `_cbrt` is pinned at r_max and does not return r_max + 1 directly.
         // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct.
         assembly ("memory-safe") {
             let mm0 := mulmod(r, r, not(0x00))
diff --git a/test/0.8.25/512Math.t.sol b/test/0.8.25/512Math.t.sol
index 2134d15ff..5ad0fc4b0 100644
--- a/test/0.8.25/512Math.t.sol
+++ b/test/0.8.25/512Math.t.sol
@@ -447,6 +447,30 @@ contract Lib512MathTest is Test {
         assertEq(x.cbrt(), r);
     }
 
+    function test512Math_cbrt_overflowCubeRegime(uint256 x_hi, uint256 x_lo) external pure {
+        uint256 r_max = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628;
+        uint256 r_max_plus_one = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885629;
+        uint256 r_max_cube_hi = 0xffffffffffffffffffffffffffffffffffffffffffec2567f7d10a5e1c63772f;
+        uint256 r_max_cube_lo = 0xd70b34358c5c72dd2dbdc27132d143e3a7f08c1088df427db0884640df2d7a00;
+
+        // Force x > r_max^3 so cbrtUp(x) must return r_max + 1, whose cube is 513 bits.
+        //
+        // Why this still passes with the current implementation:
+        // `_cbrt` is returning `r_max` (not `r_max + 1`) in this regime. If `_cbrt` returned
+        // `r_max + 1`, then `cbrt` would have to decrement based on an overflowed cube-and-compare
+        // and this assertion would fail for these near-2^512 inputs. Because `cbrt` stays equal to
+        // `r_max`, both cube-and-compare paths only cube `r_max` (which fits in 512 bits), and
+        // `cbrtUp` reaches `r_max + 1` only via its final `+1` correction.
+        x_hi = bound(x_hi, r_max_cube_hi, type(uint256).max);
+        if (x_hi == r_max_cube_hi) {
+            x_lo = bound(x_lo, r_max_cube_lo + 1, type(uint256).max);
+        }
+
+        uint512 x = alloc().from(x_hi, x_lo);
+        assertEq(x.cbrt(), r_max, "cbrt in overflow-cube regime");
+        assertEq(x.cbrtUp(), r_max_plus_one, "cbrtUp in overflow-cube regime");
+    }
+
     function test512Math_cbrtUp(uint256 x_hi, uint256 x_lo) external pure {
         uint512 x = alloc().from(x_hi, x_lo);
         uint256 r = x.cbrtUp();

From cd2f23495aaea7abe93e21977f7840e2d7707349 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 18:59:42 +0100
Subject: [PATCH 31/38] Fix AI comments

---
 src/utils/512Math.sol | 66 +++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 31 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index b98789efe..48a566c8a 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1916,39 +1916,43 @@ library Lib512MathArithmetic {
 
         r = _cbrt(x_hi, x_lo);
 
-        // Detailed note for the top-of-range regime:
-        // Let
+        // The following cube-and-compare technique for obtaining the floor appears, at first, to
+        // have an overflow bug in it. Consider that `_cbrt` returns a value within 1ulp of the
+        // correct value. Define:
         //   r_max = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628
-        // and consider x in [r_max^3 + 1, 2^512 - 1]. In this regime, cubing r_max + 1 would
-        // overflow 512 bits, but `_cbrt` still returns r_max (never r_max + 1):
-        // 1) shift = clz(x_hi) / 3 = 0 for all such x.
-        // 2) w = x_hi >> 2 lies in [0x3f..fb0959fdf442978718ddcb, 2^254 - 1].
-        // 3) In that full interval, floor(cbrt(w)) is constant:
-        //      r_hi = 0x1965fea53d6e3c82b05999
-        //    because r_hi^3 <= w < (r_hi + 1)^3.
-        // 4) Therefore d = 3 * r_hi^2 is constant:
-        //      d = 0x78f3d1d950af414cd731fe48f48fde1309821333853.
-        // 5) In the EVM code, n = shl(86, res) | limb_hi truncates to 256 bits (res is
-        //    171 bits, so res << 86 is 257 bits). The truncated div(n, d) is constant:
-        //      div(n, d) = 0x8f3a38c7f3364c49d3405  for all x in the regime.
-        //    The carry branch (shr(0xaa, res) != 0) fires for the entire regime since
-        //    res_min = 0x50ea... is already 171 bits. After carry adjustment, r_lo_pre
-        //    is constant (see step 6). Both the pre-carry and post-carry quotients stay
-        //    in one bucket because res varies by only ~0.62·2^83 across the regime,
-        //    and limb_hi's full 86-bit range contributes < 2^{-84} to n/d (negligible
-        //    vs d ≈ 2^170.8). Total swing in the continuous quotient is ~0.164.
-        //    At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max^3) and ≈ 0.283 (at
-        //    x = 2^512 - 1), so the floor never crosses an integer boundary.
-        // 6) After the carry adjustment branch, r_lo_pre is constant:
-        //      r_lo_pre = 0x2ad0f7137bc6601d885629.
+        // this means that for values of x in [r_max³ + 1, 2⁵¹² - 1], `_cbrt` could return r_max +
+        // 1, which would result in overflow when cubing `r`. However, this does not happen. Given
+        // `x` in the specified range, the `_cbrt` follows the steps below:
+        //
+        // 1) shift = clz(x_hi) / 3 = 0
+        // 2) w = x_hi >> 2 lies in [0x3fff..fffb0959fdf442978718ddcb, 2²⁵⁴ - 1]
+        // 3) In that full interval, ⌊∛w⌋ is constant. For r_hi, we get:
+        //      after 6 Newton-Raphson iterations: r_hi = 0x1965fea53d6e3c82b0c310
+        //      which forces a 7th iteration
+        //      after the branch is taken:         r_hi = 0x1965fea53d6e3c82b05999
+        // 4) Therefore d = 3 ⋅ r_hi² is constant:
+        //      d = 0x78f3d1d950af414cd731fe48f48fde1309821333853
+        // 5) n = (res << 86) | limb_hi overflows and is truncated to 256 bits. The truncated ⌊n / d⌋
+        //    is constant:
+        //      ⌊n / d⌋ = 0x8f3a38c7f3364c49d3405
+        //    The carry branch (res >> 170 != 0) fires. The carry adjustment modifies the truncated
+        //    quotient by adding:
+        //       ⌊(2²⁵⁶ - 1) / d⌋ = 0x21dd5386fc92fb58eb2224
+        //    and the final carry refinement term is zero, giving:
+        //      r_lo = 0x2ad0f7137bc6601d885629
+        //    The quotient stays in one "bucket" because `res` varies by only ~0.62·2⁸³, and
+        //    `limb_hi`'s full 86-bit range contributes <1/2⁸⁴ to n/d. Total swing in the continuous
+        //    quotient is ~0.164.  At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max³ + 1) and
+        //    ≈0.283 (at x = 2⁵¹² - 1), so the floor never crosses an integer boundary
+        // 6) After the carry adjustment branch, `r_lo` is constant:
+        //      r_lo = 0x2ad0f7137bc6601d885629
         // 7) The quadratic correction subtracts exactly 1:
-        //      floor(r_lo_pre^2 / (r_hi << 86)) = 1
+        //      ⌊r_lo² / (r_hi·2⁸⁶)⌋ = 1
         //    so r_lo = 0x2ad0f7137bc6601d885628 and
-        //      r = (r_hi << 86) + r_lo = r_max.
+        //      r = r_hi·2⁸⁶ + r_lo = r_max
         //
-        // So for this regime, the cube-and-compare code below only cubes r_max (which fits in
-        // 512 bits). `cbrtUp` reaches r_max + 1 only via its final +1 correction.
-        // `_cbrt` gives a result within 1ulp. Check if `r` overshoots and correct.
+        // So, the cube-and-compare code below only cubes a value of at most `r_max`, which fits in
+        // 512 bits. `cbrtUp` reaches `r_max + 1` only via its final +1 correction
         assembly ("memory-safe") {
             let mm0 := mulmod(r, r, not(0x00))
             let r2_lo := mul(r, r)
@@ -1971,10 +1975,10 @@ library Lib512MathArithmetic {
 
         r = _cbrt(x_hi, x_lo);
 
-        // See the detailed overflow-regime note in `cbrt` above. In particular, near 2^512,
-        // `_cbrt` is pinned at r_max and does not return r_max + 1 directly.
         // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct.
         assembly ("memory-safe") {
+            // See the detailed overflow-regime note in `cbrt` above. In particular, near 2⁵¹²,
+            // `_cbrt` is pinned at `r_max` and does not return `r_max + 1` directly.
             let mm0 := mulmod(r, r, not(0x00))
             let r2_lo := mul(r, r)
             let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo))

From 126ed9489ce4724d63ebb9a5af4fd597fb03c9b5 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Tue, 24 Feb 2026 19:18:45 +0100
Subject: [PATCH 32/38] Pedantry

---
 src/utils/512Math.sol | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index 48a566c8a..b1c62df9b 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1920,13 +1920,13 @@ library Lib512MathArithmetic {
         // have an overflow bug in it. Consider that `_cbrt` returns a value within 1ulp of the
         // correct value. Define:
         //   r_max = 0x6597fa94f5b8f20ac16666ad0f7137bc6601d885628
-        // this means that for values of x in [r_max³ + 1, 2⁵¹² - 1], `_cbrt` could return r_max +
-        // 1, which would result in overflow when cubing `r`. However, this does not happen. Given
-        // `x` in the specified range, the `_cbrt` follows the steps below:
+        // this means that for values of x in [r_max³, 2⁵¹² - 1], `_cbrt` could return r_max + 1,
+        // which would result in overflow when cubing `r`. However, this does not happen. Given `x`
+        // in the specified range, the `_cbrt` follows the steps below:
         //
-        // 1) shift = clz(x_hi) / 3 = 0
+        // 1) shift = ⌊clz(x_hi) / 3⌋ = 0
         // 2) w = x_hi >> 2 lies in [0x3fff..fffb0959fdf442978718ddcb, 2²⁵⁴ - 1]
-        // 3) In that full interval, ⌊∛w⌋ is constant. For r_hi, we get:
+        // 3) In that full interval, ⌊∛w⌋ is constant. For `r_hi`, we get:
         //      after 6 Newton-Raphson iterations: r_hi = 0x1965fea53d6e3c82b0c310
         //      which forces a 7th iteration
         //      after the branch is taken:         r_hi = 0x1965fea53d6e3c82b05999
@@ -1937,13 +1937,13 @@ library Lib512MathArithmetic {
         //      ⌊n / d⌋ = 0x8f3a38c7f3364c49d3405
         //    The carry branch (res >> 170 != 0) fires. The carry adjustment modifies the truncated
         //    quotient by adding:
-        //       ⌊(2²⁵⁶ - 1) / d⌋ = 0x21dd5386fc92fb58eb2224
+        //      ⌊(2²⁵⁶ - 1) / d⌋ = 0x21dd5386fc92fb58eb2224
         //    and the final carry refinement term is zero, giving:
         //      r_lo = 0x2ad0f7137bc6601d885629
         //    The quotient stays in one "bucket" because `res` varies by only ~0.62·2⁸³, and
         //    `limb_hi`'s full 86-bit range contributes <1/2⁸⁴ to n/d. Total swing in the continuous
-        //    quotient is ~0.164.  At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max³ + 1) and
-        //    ≈0.283 (at x = 2⁵¹² - 1), so the floor never crosses an integer boundary
+        //    quotient is ~0.164.  At the boundaries, frac(n/d) ≈ 0.118 (at x = r_max³) and ≈0.283
+        //    (at x = 2⁵¹² - 1), so the floor never crosses an integer boundary
         // 6) After the carry adjustment branch, `r_lo` is constant:
         //      r_lo = 0x2ad0f7137bc6601d885629
         // 7) The quadratic correction subtracts exactly 1:

From 3574b35ec07e4b62e44bcb18c8a0400646157d7f Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Wed, 25 Feb 2026 12:10:57 +0100
Subject: [PATCH 33/38] Bug! Avoid overflow in `Cbrt.cbrtUp`

---
 src/vendor/Cbrt.sol | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/vendor/Cbrt.sol b/src/vendor/Cbrt.sol
index ccd471dd2..32d244444 100644
--- a/src/vendor/Cbrt.sol
+++ b/src/vendor/Cbrt.sol
@@ -38,8 +38,10 @@ library Cbrt {
     function cbrtUp(uint256 x) internal pure returns (uint256 z) {
         z = _cbrt(x);
         assembly ("memory-safe") {
-            // Round up.
-            z := add(z, lt(mul(z, mul(z, z)), x))
+            // Round up. Avoid cubing `z` to avoid overflow
+            let z2 := mul(z, z)
+            let d := div(x, z2)
+            z := add(z, gt(add(d, lt(mul(d, z2), x)), z))
         }
     }
 }

From 817bb55faafe39270680872b32813d3408e91caa Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Wed, 25 Feb 2026 12:11:51 +0100
Subject: [PATCH 34/38] Add some tests for `cbrtUp` and `sqrtUp`

---
 test/0.8.25/Cbrt.t.sol | 28 ++++++++++++++++++++++++++++
 test/0.8.25/Sqrt.t.sol | 28 ++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 test/0.8.25/Cbrt.t.sol
 create mode 100644 test/0.8.25/Sqrt.t.sol

diff --git a/test/0.8.25/Cbrt.t.sol b/test/0.8.25/Cbrt.t.sol
new file mode 100644
index 000000000..a07dffebf
--- /dev/null
+++ b/test/0.8.25/Cbrt.t.sol
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+pragma solidity ^0.8.25;
+
+import {Cbrt} from "src/vendor/Cbrt.sol";
+import {Test} from "@forge-std/Test.sol";
+
+contract CbrtTest is Test {
+    using Cbrt for uint256;
+
+    uint256 private constant _CBRT_FLOOR_MAX_UINT256 = 0x285145f31ae515c447bb56;
+    uint256 private constant _CBRT_CEIL_MAX_UINT256 = 0x285145f31ae515c447bb57;
+    uint256 private constant _CBRT_FLOOR_MAX_UINT256_CUBE =
+        0xffffffffffffffffffffef214b5539a2d22f71387253e480168f34c9da3f5898;
+
+    function testCbrtUp_overflowCubeRange(uint256 x) external pure {
+        x = bound(x, _CBRT_FLOOR_MAX_UINT256_CUBE + 1, type(uint256).max);
+
+        assertEq(x.cbrt(), _CBRT_FLOOR_MAX_UINT256, "cbrt overflow-cube range");
+        assertEq(x.cbrtUp(), _CBRT_CEIL_MAX_UINT256, "cbrtUp overflow-cube range");
+    }
+
+    function testCbrtUp_overflowCubeBoundary() external pure {
+        uint256 x = _CBRT_FLOOR_MAX_UINT256_CUBE;
+
+        assertEq(x.cbrt(), _CBRT_FLOOR_MAX_UINT256, "cbrt boundary");
+        assertEq(x.cbrtUp(), _CBRT_FLOOR_MAX_UINT256, "cbrtUp boundary");
+    }
+}
diff --git a/test/0.8.25/Sqrt.t.sol b/test/0.8.25/Sqrt.t.sol
new file mode 100644
index 000000000..1a8a085ed
--- /dev/null
+++ b/test/0.8.25/Sqrt.t.sol
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+pragma solidity ^0.8.25;
+
+import {Sqrt} from "src/vendor/Sqrt.sol";
+import {Test} from "@forge-std/Test.sol";
+
+contract SqrtTest is Test {
+    using Sqrt for uint256;
+
+    uint256 private constant _SQRT_FLOOR_MAX_UINT256 = type(uint128).max;
+    uint256 private constant _SQRT_CEIL_MAX_UINT256 = uint256(1) << 128;
+    uint256 private constant _SQRT_FLOOR_MAX_UINT256_SQUARED =
+        0xfffffffffffffffffffffffffffffffe00000000000000000000000000000001;
+
+    function testSqrtUp_overflowSquareRange(uint256 x) external pure {
+        x = bound(x, _SQRT_FLOOR_MAX_UINT256_SQUARED + 1, type(uint256).max);
+
+        assertEq(x.sqrt(), _SQRT_FLOOR_MAX_UINT256, "sqrt overflow-square range");
+        assertEq(x.sqrtUp(), _SQRT_CEIL_MAX_UINT256, "sqrtUp overflow-square range");
+    }
+
+    function testSqrtUp_overflowSquareBoundary() external pure {
+        uint256 x = _SQRT_FLOOR_MAX_UINT256_SQUARED;
+
+        assertEq(x.sqrt(), _SQRT_FLOOR_MAX_UINT256, "sqrt boundary");
+        assertEq(x.sqrtUp(), _SQRT_FLOOR_MAX_UINT256, "sqrtUp boundary");
+    }
+}

From f061574491bef0af09f1fb1afb57bcad7c653011 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Wed, 25 Feb 2026 16:48:06 +0100
Subject: [PATCH 35/38] Add more tests

---
 test/0.8.25/Cbrt.t.sol | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/test/0.8.25/Cbrt.t.sol b/test/0.8.25/Cbrt.t.sol
index a07dffebf..6c439b934 100644
--- a/test/0.8.25/Cbrt.t.sol
+++ b/test/0.8.25/Cbrt.t.sol
@@ -12,6 +12,32 @@ contract CbrtTest is Test {
     uint256 private constant _CBRT_FLOOR_MAX_UINT256_CUBE =
         0xffffffffffffffffffffef214b5539a2d22f71387253e480168f34c9da3f5898;
 
+    function testCbrt(uint256 x) external pure {
+        uint256 r = x.cbrt();
+        assertLe(r * r * r, x, "cbrt too high");
+        if (x < _CBRT_FLOOR_MAX_UINT256_CUBE) {
+            r++;
+            assertGt(r * r * r, x, "cbrt too low");
+        } else {
+            assertEq(r, _CBRT_FLOOR_MAX_UINT256, "cbrt overflow");
+        }
+    }
+
+    function testCbrtUp(uint256 x) external pure {
+        uint256 r = x.cbrtUp();
+        if (x <= _CBRT_FLOOR_MAX_UINT256_CUBE) {
+            assertGe(r * r * r, x, "cbrtUp too low");
+        } else {
+            assertEq(r, _CBRT_CEIL_MAX_UINT256, "cbrtUp overflow");
+        }
+        if (x != 0) {
+            r--;
+            assertLt(r * r * r, x, "cbrtUp too high");
+        } else {
+            assertEq(r, 0, "cbrtUp underflow");
+        }
+    }
+    
     function testCbrtUp_overflowCubeRange(uint256 x) external pure {
         x = bound(x, _CBRT_FLOOR_MAX_UINT256_CUBE + 1, type(uint256).max);
 

From a131f5584cc70550cca7b4acf5485d8438810fcf Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Wed, 25 Feb 2026 16:49:31 +0100
Subject: [PATCH 36/38] Add more tests

---
 test/0.8.25/Sqrt.t.sol | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/test/0.8.25/Sqrt.t.sol b/test/0.8.25/Sqrt.t.sol
index 1a8a085ed..f9c051113 100644
--- a/test/0.8.25/Sqrt.t.sol
+++ b/test/0.8.25/Sqrt.t.sol
@@ -12,6 +12,32 @@ contract SqrtTest is Test {
     uint256 private constant _SQRT_FLOOR_MAX_UINT256_SQUARED =
         0xfffffffffffffffffffffffffffffffe00000000000000000000000000000001;
 
+    function testSqrt(uint256 x) external pure {
+        uint256 r = x.sqrt();
+        assertLe(r * r, x, "sqrt too high");
+        if (x < _SQRT_FLOOR_MAX_UINT256_SQUARED) {
+            r++;
+            assertGt(r * r, x, "sqrt too low");
+        } else {
+            assertEq(r, _SQRT_FLOOR_MAX_UINT256, "sqrt overflow");
+        }
+    }
+
+    function testSqrtUp(uint256 x) external pure {
+        uint256 r = x.sqrtUp();
+        if (x <= _SQRT_FLOOR_MAX_UINT256_SQUARED) {
+            assertGe(r * r, x, "sqrtUp too low");
+        } else {
+            assertEq(r, _SQRT_CEIL_MAX_UINT256, "sqrtUp overflow");
+        }
+        if (x != 0) {
+            r--;
+            assertLt(r * r, x, "sqrtUp too high");
+        } else {
+            assertEq(r, 0, "sqrtUp underflow");
+        }
+    }
+
     function testSqrtUp_overflowSquareRange(uint256 x) external pure {
         x = bound(x, _SQRT_FLOOR_MAX_UINT256_SQUARED + 1, type(uint256).max);
 

From 98adb95b127650fd59df26833b23b37859853ed6 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Wed, 25 Feb 2026 16:49:50 +0100
Subject: [PATCH 37/38] `forge fmt`

---
 test/0.8.25/Cbrt.t.sol | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/0.8.25/Cbrt.t.sol b/test/0.8.25/Cbrt.t.sol
index 6c439b934..189d3bd4c 100644
--- a/test/0.8.25/Cbrt.t.sol
+++ b/test/0.8.25/Cbrt.t.sol
@@ -37,7 +37,7 @@ contract CbrtTest is Test {
             assertEq(r, 0, "cbrtUp underflow");
         }
     }
-    
+
     function testCbrtUp_overflowCubeRange(uint256 x) external pure {
         x = bound(x, _CBRT_FLOOR_MAX_UINT256_CUBE + 1, type(uint256).max);
 

From 6e0df453af8deb7bca84c3472211f234d33bf6f6 Mon Sep 17 00:00:00 2001
From: Duncan Townsend <git@duncancmt.com>
Date: Thu, 26 Feb 2026 11:40:51 +0100
Subject: [PATCH 38/38] Comments

---
 src/utils/512Math.sol | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/utils/512Math.sol b/src/utils/512Math.sol
index b1c62df9b..1de99bccc 100644
--- a/src/utils/512Math.sol
+++ b/src/utils/512Math.sol
@@ -1953,16 +1953,22 @@ library Lib512MathArithmetic {
         //
         // So, the cube-and-compare code below only cubes a value of at most `r_max`, which fits in
         // 512 bits. `cbrtUp` reaches `r_max + 1` only via its final +1 correction
+        //
+        // The following assembly block is identical to
+        //   (uint256 r2_hi, uint256 r2_lo) = _mul(r, r);
+        //   (uint256 r3_hi, uint256 r3_lo) = _mul(r2_hi, r2_lo, r);
+        //   r = r.unsafeDec(_gt(r3_hi, r3_lo, x_hi, x_lo));
+        // but is substantially more gas efficient for inexplicable reasons
         assembly ("memory-safe") {
-            let mm0 := mulmod(r, r, not(0x00))
+            let mm := mulmod(r, r, not(0x00))
             let r2_lo := mul(r, r)
-            let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo))
+            let r2_hi := sub(sub(mm, r2_lo), lt(mm, r2_lo))
 
-            let mm1 := mulmod(r2_lo, r, not(0x00))
-            let lo := mul(r2_lo, r)
-            let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r))
+            mm := mulmod(r2_lo, r, not(0x00))
+            let r3_lo := mul(r2_lo, r)
+            let r3_hi := add(sub(sub(mm, r3_lo), lt(mm, r3_lo)), mul(r2_hi, r))
 
-            r := sub(r, or(gt(hi, x_hi), and(eq(hi, x_hi), gt(lo, x_lo))))
+            r := sub(r, or(gt(r3_hi, x_hi), and(eq(r3_hi, x_hi), gt(r3_lo, x_lo))))
         }
     }
 
@@ -1976,18 +1982,24 @@ library Lib512MathArithmetic {
         r = _cbrt(x_hi, x_lo);
 
         // `_cbrt` gives a result within 1ulp. Check if `r` is too low and correct.
+        //
+        // The following assembly block is identical to
+        //   (uint256 r2_hi, uint256 r2_lo) = _mul(r, r);
+        //   (uint256 r3_hi, uint256 r3_lo) = _mul(r2_hi, r2_lo, r);
+        //   r = r.unsafeInc(_gt(x_hi, x_lo, r3_hi, r3_lo));
+        // but is substantially more gas efficient for inexplicable reasons
         assembly ("memory-safe") {
-            // See the detailed overflow-regime note in `cbrt` above. In particular, near 2⁵¹²,
+            // See the detailed overflow-regime note in `cbrt` above. In particular, near x = 2⁵¹²,
             // `_cbrt` is pinned at `r_max` and does not return `r_max + 1` directly.
-            let mm0 := mulmod(r, r, not(0x00))
+            let mm := mulmod(r, r, not(0x00))
             let r2_lo := mul(r, r)
-            let r2_hi := sub(sub(mm0, r2_lo), lt(mm0, r2_lo))
+            let r2_hi := sub(sub(mm, r2_lo), lt(mm, r2_lo))
 
-            let mm1 := mulmod(r2_lo, r, not(0x00))
-            let lo := mul(r2_lo, r)
-            let hi := add(sub(sub(mm1, lo), lt(mm1, lo)), mul(r2_hi, r))
+            mm := mulmod(r2_lo, r, not(0x00))
+            let r3_lo := mul(r2_lo, r)
+            let r3_hi := add(sub(sub(mm, r3_lo), lt(mm, r3_lo)), mul(r2_hi, r))
 
-            r := add(r, or(lt(hi, x_hi), and(eq(hi, x_hi), lt(lo, x_lo))))
+            r := add(r, or(lt(r3_hi, x_hi), and(eq(r3_hi, x_hi), lt(r3_lo, x_lo))))
         }
     }