diff --git a/Cargo.lock b/Cargo.lock
index 027e2ece..81c26743 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -458,7 +458,7 @@ dependencies = [
  "assert_cmd",
  "cargo_metadata",
  "clap",
- "codspeed 2.8.0-alpha.0",
+ "codspeed",
  "fs_extra",
  "glob",
  "itertools 0.13.0",
@@ -579,18 +579,6 @@ version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
 
-[[package]]
-name = "codspeed"
-version = "2.7.2"
-source = "git+https://github.com/CodSpeedHQ/codspeed-rust?branch=cod-526-build-and-find-walltime-entrypoint-with-divan#209374e1bc7e49221879f3348a364365992ae065"
-dependencies = [
- "colored",
- "libc",
- "serde",
- "serde_json",
- "uuid",
-]
-
 [[package]]
 name = "codspeed"
 version = "2.8.0-alpha.0"
@@ -608,7 +596,7 @@ name = "codspeed-bencher-compat"
 version = "2.8.0-alpha.0"
 dependencies = [
  "bencher",
- "codspeed 2.8.0-alpha.0",
+ "codspeed",
 ]
 
 [[package]]
@@ -616,7 +604,7 @@ name = "codspeed-criterion-compat"
 version = "2.8.0-alpha.0"
 dependencies = [
  "async-std",
- "codspeed 2.8.0-alpha.0",
+ "codspeed",
  "colored",
  "criterion",
  "futures",
@@ -628,21 +616,35 @@ dependencies = [
 name = "codspeed-divan-compat"
 version = "2.8.0-alpha.0"
 dependencies = [
- "codspeed 2.8.0-alpha.0",
+ "codspeed",
  "codspeed-divan-compat-macros",
- "divan",
+ "codspeed-divan-compat-walltime",
 ]
 
 [[package]]
 name = "codspeed-divan-compat-macros"
 version = "2.8.0-alpha.0"
 dependencies = [
- "divan-macros 0.1.17 (registry+https://github.com/rust-lang/crates.io-index)",
+ "divan-macros",
  "proc-macro2",
  "quote",
  "syn",
 ]
 
+[[package]]
+name = "codspeed-divan-compat-walltime"
+version = "0.1.17"
+dependencies = [
+ "cfg-if",
+ "clap",
+ "codspeed",
+ "condtype",
+ "divan-macros",
+ "libc",
+ "mimalloc",
+ "regex-lite",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.2"
@@ -773,20 +775,6 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8"
 
-[[package]]
-name = "divan"
-version = "0.1.17"
-source = "git+https://github.com/CodSpeedHQ/divan#e605bf0c971aeb08bc55867abecc56bafbbdc3a0"
-dependencies = [
- "cfg-if",
- "clap",
- "codspeed 2.7.2",
- "condtype",
- "divan-macros 0.1.17 (git+https://github.com/CodSpeedHQ/divan)",
- "libc",
- "regex-lite",
-]
-
 [[package]]
 name = "divan-macros"
 version = "0.1.17"
@@ -798,16 +786,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "divan-macros"
-version = "0.1.17"
-source = "git+https://github.com/CodSpeedHQ/divan#e605bf0c971aeb08bc55867abecc56bafbbdc3a0"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "doc-comment"
 version = "0.3.3"
@@ -1163,6 +1141,16 @@ version = "0.2.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
 
+[[package]]
+name = "libmimalloc-sys"
+version = "0.1.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
+dependencies = [
+ "cc",
+ "libc",
+]
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.3.8"
@@ -1200,6 +1188,15 @@ version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
+[[package]]
+name = "mimalloc"
+version = "0.1.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
+dependencies = [
+ "libmimalloc-sys",
+]
+
 [[package]]
 name = "miniz_oxide"
 version = "0.7.4"
diff --git a/Cargo.toml b/Cargo.toml
index a7ca6d1e..4633e963 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ members = [
     "crates/cargo-codspeed",
     "crates/divan_compat",
     "crates/divan_compat/macros",
+    "crates/divan_compat/divan_fork",
 ]
 resolver = "2"
 
diff --git a/crates/codspeed/src/walltime.rs b/crates/codspeed/src/walltime.rs
index a3b0e4d5..32517001 100644
--- a/crates/codspeed/src/walltime.rs
+++ b/crates/codspeed/src/walltime.rs
@@ -48,6 +48,8 @@ impl RawWallTimeData {
 }
 
 /// Entry point called in patched integration to harvest raw walltime data
+///
+/// `CODSPEED_CARGO_WORKSPACE_ROOT` is expected to be set for this to work
 pub fn collect_raw_walltime_results(
     scope: &str,
     name: String,
diff --git a/crates/divan_compat/Cargo.toml b/crates/divan_compat/Cargo.toml
index 9c39f67b..226b2bff 100644
--- a/crates/divan_compat/Cargo.toml
+++ b/crates/divan_compat/Cargo.toml
@@ -19,7 +19,7 @@ keywords = ["codspeed", "benchmark", "divan"]
 
 [dependencies]
 codspeed = { path = "../codspeed", version = "=2.8.0-alpha.0" }
-divan = { git = "https://github.com/CodSpeedHQ/divan" }
+divan = { package = "codspeed-divan-compat-walltime", path = "./divan_fork", version = "=0.1.17" }
 codspeed-divan-compat-macros = { version = "=2.8.0-alpha.0", path = './macros' }
 
 [[bench]]
diff --git a/crates/divan_compat/divan_fork/.github/FUNDING.yml b/crates/divan_compat/divan_fork/.github/FUNDING.yml
new file mode 100644
index 00000000..662ce5d1
--- /dev/null
+++ b/crates/divan_compat/divan_fork/.github/FUNDING.yml
@@ -0,0 +1,2 @@
+github: ['nvzqz']
+custom: ['https://paypal.me/nvzqz']
diff --git a/crates/divan_compat/divan_fork/.github/workflows/ci.yml b/crates/divan_compat/divan_fork/.github/workflows/ci.yml
new file mode 100644
index 00000000..5fecd89c
--- /dev/null
+++ b/crates/divan_compat/divan_fork/.github/workflows/ci.yml
@@ -0,0 +1,191 @@
+on: [push, pull_request]
+
+name: CI
+
+env:
+  CARGO_HOME: ${{ github.workspace }}/.cargo
+  CARGO_TERM_COLOR: always
+  RUSTFLAGS: -D warnings -A unused-imports
+  RUSTDOCFLAGS: -D warnings
+  RUST_BACKTRACE: full
+
+jobs:
+  # Check formatting.
+  rustfmt:
+    name: Rustfmt
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - run: rustup update stable --no-self-update
+      - run: rustc -Vv
+      - run: cargo fmt --all -- --check
+
+  # Build documentation.
+  rustdoc:
+    name: Rustdoc
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v3.3.2
+        with:
+          path: |
+            ${{ env.CARGO_HOME }}
+            target
+          key: rustdoc-${{ runner.os }}
+      - run: rustup update stable --no-self-update
+      - run: rustc -Vv
+      - run: cargo rustdoc --all-features -- --document-private-items
+
+  # Run linter.
+  clippy:
+    name: Clippy
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os:
+          - ubuntu-latest
+          - macos-latest
+          - windows-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v3.3.2
+        with:
+          path: |
+            ${{ env.CARGO_HOME }}
+            target
+          key: clippy-${{ runner.os }}
+      - run: rustup update stable --no-self-update
+      - run: rustc -Vv
+      - run: cargo clippy --all --all-targets --all-features
+
+  # Run tests in `src/` and `tests/`.
+  unit-test:
+    name: Unit Test
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os:
+          - ubuntu-latest
+          - macos-latest
+          - windows-latest
+        rust:
+          - stable
+          - nightly
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v3.3.2
+        with:
+          path: |
+            ${{ env.CARGO_HOME }}
+            target
+          key: unit-test-${{ runner.os }}-${{ matrix.rust }}
+      - run: rustup default ${{ matrix.rust }}
+      - run: rustup update ${{ matrix.rust }} --no-self-update
+      - run: rustc -Vv
+      - run: cargo test -p divan -p divan-macros
+
+  # Run tests in `src/` and `tests/` using Miri.
+  unit-test-miri:
+    name: Unit Test (Miri)
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v3.3.2
+        with:
+          path: |
+            ${{ env.CARGO_HOME }}
+            target
+          key: miri-${{ runner.os }}
+      - run: rustup default nightly
+      - run: rustup update nightly --no-self-update
+      - run: rustup component add miri
+      - run: rustc -Vv
+      - run: cargo miri test -p divan -p divan-macros
+
+  # Run `examples/` directory as tests.
+  examples-test:
+    name: Examples Test
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os:
+          - ubuntu-latest
+          - macos-latest
+          - windows-latest
+        rust:
+          - stable
+          - nightly
+    env:
+      DIVAN_ITEMS_COUNT: 0
+      DIVAN_BYTES_COUNT: 1
+      DIVAN_CHARS_COUNT: 2
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v3.3.2
+        with:
+          path: |
+            ${{ env.CARGO_HOME }}
+            target
+          key: examples-test-${{ runner.os }}-${{ matrix.rust }}
+      - run: rustup default ${{ matrix.rust }}
+      - run: rustup update ${{ matrix.rust }} --no-self-update
+      - run: rustc -Vv
+      - run: cargo test -p examples --all-features --benches
+
+  # Run `examples/` directory as benchmarks.
+  examples-bench:
+    name: Examples Bench
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+    runs-on: ${{ matrix.os }}
+    env:
+      # Run each benchmark within 2 seconds.
+      DIVAN_MAX_TIME: 2
+    strategy:
+      matrix:
+        os:
+          - ubuntu-latest
+          - macos-latest
+          - windows-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v3.3.2
+        with:
+          path: |
+            ${{ env.CARGO_HOME }}
+            target
+          key: examples-bench-${{ runner.os }}
+      - run: rustup update stable --no-self-update
+      - run: rustc -Vv
+      - run: cargo bench -p examples --all-features
+
+  # Run `internal_benches/` directory as benchmarks.
+  internals-bench:
+    name: Internals Bench
+    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
+    runs-on: ${{ matrix.os }}
+    env:
+      # Run each benchmark within 2 seconds.
+      DIVAN_MAX_TIME: 2
+    strategy:
+      matrix:
+        os:
+          - ubuntu-latest
+          - macos-latest
+          - windows-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/cache@v3.3.2
+        with:
+          path: |
+            ${{ env.CARGO_HOME }}
+            target
+          key: internals-bench-${{ runner.os }}
+      - run: rustup update stable --no-self-update
+      - run: rustc -Vv
+      - run: cargo bench -p internal_benches --all-features
diff --git a/crates/divan_compat/divan_fork/.gitignore b/crates/divan_compat/divan_fork/.gitignore
new file mode 100644
index 00000000..8b3cb274
--- /dev/null
+++ b/crates/divan_compat/divan_fork/.gitignore
@@ -0,0 +1,88 @@
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Rust ###
+# Generated by Cargo
+# will have compiled files and executables
+debug/
+target/
+
+# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
+# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+Cargo.lock
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
+
+### Windows ###
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
diff --git a/crates/divan_compat/divan_fork/CHANGELOG.md b/crates/divan_compat/divan_fork/CHANGELOG.md
new file mode 100644
index 00000000..d3bd2348
--- /dev/null
+++ b/crates/divan_compat/divan_fork/CHANGELOG.md
@@ -0,0 +1,391 @@
+# Changelog [![crates.io][crate-badge]][crate]
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
+and this project adheres to [Semantic
+Versioning](http://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+## [0.1.17] - 2024-12-04
+
+### Changed
+
+- Set [MSRV] to 1.80 for [`LazyLock`] and new `size_of` prelude import.
+
+- Reduced thread pool memory usage by many kilobytes by using rendezvous
+  channels instead of array-based channels.
+
+## [0.1.16] - 2024-11-25
+
+### Added
+
+- Thread pool for reusing threads across multi-threaded benchmarks. The result
+  is that when running Divan benchmarks under a sampling profiler, the
+  profiler's output will be cleaner and easier to understand. ([#37])
+
+- Track the maximum number of allocations during a benchmark.
+
+### Changed
+
+- Make private `Arg::get` trait method not take `self`, so that text editors
+  don't recommend using it. ([#59])
+
+- Cache `BenchOptions` using `LazyLock` instead of `OnceLock`, saving space and
+  simplifying the implementation.
+
+## [0.1.15] - 2024-10-31
+
+### Added
+
+- [`CyclesCount`] counter to display cycle throughput as Hertz.
+
+- Track the maximum number of bytes allocated during a benchmark.
+
+### Removed
+
+- Remove `has_cpuid` polyfill due to it no longer being planned for Rust, since
+  CPUID is assumed to be available on all old x86 Rust targets.
+
+### Fixed
+
+- List generic benchmark type parameter `A<4>` before `A<32>`. ([#64])
+
+- Improve precision by using `f64` when calculating allocation count and sizes
+  for the median samples.
+
+- Multi-thread allocation counting in `sum_alloc_tallies` on macOS was loading a
+  null pointer instead of the pointer initialized by `sync_threads`.
+
+### Changes
+
+- Sort all output benchmark names
+  [naturally](https://en.wikipedia.org/wiki/Natural_sort_order) instead of
+  [lexicographically](https://en.wikipedia.org/wiki/Lexicographic_order).
+
+- Internally reuse [`&[&str]` slice][slice] for [`args`] names.
+
+- Subtract overhead of [`AllocProfiler`] from timings. Now that Divan also
+  tracks the maximum bytes allocated, the overhead was apparent in timings.
+
+- Simplify `ThreadAllocInfo::clear`.
+
+- Move measured loop overhead from `SharedContext` to global `OnceLock`.
+
+- Macros no longer rely on `std` being re-exported by Divan. Instead they use
+  `::std` or `::core` to greatly simplify code. Although this is technically a
+  breaking change, it is extremely unlikely to do `extern crate std as x`.
+
+## [0.1.14] - 2024-02-17
+
+### Fixed
+
+- Set correct field in [`Divan::max_time`]. ([#45](https://github.com/nvzqz/divan/pull/45))
+
+### Changes
+
+- Improve [`args`] documentation by relating it to using [`Bencher`].
+
+- Define [`BytesCount::of_iter`] in terms of [`BytesCount::of_many`].
+
+## [0.1.13] - 2024-02-09
+
+### Fixed
+
+- Missing update to `divan-macros` dependency.
+
+## [0.1.12] - 2024-02-09
+
+### Added
+
+- Display [`args`] option values with [`Debug`] instead if [`ToString`] is not
+  implemented.
+
+  This makes it simple to use enums with derived [`Debug`]:
+
+  ```rs
+  #[derive(Debug)]
+  enum Arg { A, B }
+
+  #[divan::bench(args = [Arg::A, Arg::B])]
+  fn bench_args(arg: &Arg) {
+      ...
+  }
+  ```
+
+- Documentation of when to use [`black_box`] in benchmarks.
+
+## [0.1.11] - 2024-01-20
+
+### Fixed
+
+- Sorting negative [`args`] numbers.
+
+## [0.1.10] - 2024-01-20
+
+### Fixed
+
+- Sort [`args`] numbers like [`consts`].
+
+## [0.1.9] - 2024-01-20
+
+### Added
+
+- [`args`] option for providing runtime arguments to benchmarks:
+
+  ```rs
+  #[divan::bench(args = [1, 2, 3])]
+  fn args_list(arg: usize) { ... }
+
+  #[divan::bench(args = 1..=3)]
+  fn args_range(arg: usize) { ... }
+
+  const ARGS: &[usize] = &[1, 2, 3];
+
+  #[divan::bench(args = ARGS)]
+  fn args_const(arg: usize) { ... }
+  ```
+
+  This option may be preferred over the similar [`consts`] option because:
+  - It is compatible with more types, only requiring that the argument type
+    implements [`Any`], [`Copy`], [`Send`], [`Sync`], and [`ToString`]. [`Copy`]
+    is not needed if the argument is used through a reference.
+  - It does not increase compile times, unlike [`consts`] which needs to
+    generate new code for each constant used.
+
+## [0.1.8] - 2023-12-19
+
+### Changes
+
+- Reduce [`AllocProfiler`] footprint from 6-10ns to 1-2ns:
+
+  - Thread-local values are now exclusively owned by their threads and are no
+    longer kept in a global list. This enables some optimizations:
+
+    - Performing faster unsynchronized arithmetic.
+
+    - Removing one level of pointer indirection by storing the thread-local
+      value entirely inline in [`thread_local!`], rather than storing a pointer
+      to a globally-shared instance.
+
+    - Compiler emits SIMD arithmetic for x86_64 using `paddq`.
+
+  - Improved thread-local lookup on x86_64 macOS by using a static lookup key
+    instead of a dynamic key from [`pthread_key_create`]. Key 11 is used because
+    it is reserved for Windows.
+
+    The `dyn_thread_local` crate feature disables this optimization. This is
+    recommended if your code or another dependency uses the same static key.
+
+### Fixed
+
+- Remove unused allocations if [`AllocProfiler`] is not active as the global
+  allocator.
+
+## [0.1.7] - 2023-12-13
+
+### Changes
+
+- Improve [`AllocProfiler`] implementation documentation.
+
+- Limit [`AllocProfiler`] mean count outputs to 4 significant digits to not be
+  very wide and for consistency with other outputs.
+
+## [0.1.6] - 2023-12-13
+
+### Added
+
+- [`AllocProfiler`] allocator that tracks allocation counts and sizes during
+  benchmarks.
+
+## [0.1.5] - 2023-12-05
+
+### Added
+
+- [`black_box_drop`](https://docs.rs/divan/0.1.5/divan/fn.black_box_drop.html)
+  convenience function for [`black_box`] + [`drop`]. This is useful when
+  benchmarking a lazy [`Iterator`] to completion with `for_each`:
+
+  ```rust
+  #[divan::bench]
+  fn parse_iter() {
+      let input: &str = // ...
+
+      Parser::new(input)
+          .for_each(divan::black_box_drop);
+  }
+  ```
+
+## [0.1.4] - 2023-12-02
+
+### Added
+
+- `From` implementations for counters on references to `u8`–`u64` and `usize`,
+  such as `From<&u64>` and `From<&&u64>`. This allows for doing:
+
+  ```rust
+  bencher
+      .with_inputs(|| { ... })
+      .input_counter(ItemsCount::from)
+      .bench_values(|n| { ... });
+  ```
+
+- [`Bencher::count_inputs_as<C>`](https://docs.rs/divan/0.1.4/divan/struct.Bencher.html#method.count_inputs_as)
+  method to convert inputs to a `Counter`:
+
+  ```rust
+  bencher
+      .with_inputs(|| -> usize {
+          // ...
+      })
+      .count_inputs_as::<ItemsCount>()
+      .bench_values(|n| -> Vec<usize> {
+          (0..n).collect()
+      });
+  ```
+
+## [0.1.3] - 2023-11-21
+
+### Added
+
+- Convenience shorthand options for `#[divan::bench]` and
+  `#[divan::bench_group]` counters:
+  - [`bytes_count`](https://docs.rs/divan/0.1.3/divan/attr.bench.html#bytes_count)
+    for `counter = BytesCount::from(n)`
+  - [`chars_count`](https://docs.rs/divan/0.1.3/divan/attr.bench.html#chars_count)
+    for `counter = CharsCount::from(n)`
+  - [`items_count`](https://docs.rs/divan/0.1.3/divan/attr.bench.html#items_count)
+    for `counter = ItemsCount::from(n)`
+
+- Support for NetBSD, DragonFly BSD, and Haiku OS by using pre-`main`.
+
+- Set global thread counts using:
+  - [`Divan::threads`](https://docs.rs/divan/0.1.3/divan/struct.Divan.html#method.threads)
+  - `--threads A B C...` CLI arg
+  - `DIVAN_THREADS=A,B,C` env var
+
+  The following example will benchmark across 2, 4, and [available parallelism]
+  thread counts:
+
+  ```sh
+  DIVAN_THREADS=0,2,4 cargo bench -q -p examples --bench atomic
+  ```
+
+- Set global
+  [`Counter`s](https://docs.rs/divan/0.1.3/divan/counter/trait.Counter.html) at
+  runtime using:
+  - [`Divan::counter`](https://docs.rs/divan/0.1.3/divan/struct.Divan.html#method.counter)
+  - [`Divan::items_count`](https://docs.rs/divan/0.1.3/divan/struct.Divan.html#method.items_count)
+  - [`Divan::bytes_count`](https://docs.rs/divan/0.1.3/divan/struct.Divan.html#method.bytes_count)
+  - [`Divan::chars_count`](https://docs.rs/divan/0.1.3/divan/struct.Divan.html#method.chars_count)
+  - `--items-count N` CLI arg
+  - `--bytes-count N` CLI arg
+  - `--chars-count N` CLI arg
+  - `DIVAN_ITEMS_COUNT=N` env var
+  - `DIVAN_BYTES_COUNT=N` env var
+  - `DIVAN_CHARS_COUNT=N` env var
+
+- `From<C>` for
+  [`ItemsCount`](https://docs.rs/divan/0.1.3/divan/counter/struct.ItemsCount.html),
+  [`BytesCount`](https://docs.rs/divan/0.1.3/divan/counter/struct.BytesCount.html),
+  and
+  [`CharsCount`](https://docs.rs/divan/0.1.3/divan/counter/struct.CharsCount.html)
+  where `C` is `u8`–`u64` or `usize` (via `CountUInt` internally). This provides
+  an alternative to the `new` constructor.
+
+- [`BytesCount::of_many`](https://docs.rs/divan/0.1.3/divan/counter/struct.BytesCount.html#method.of_many)
+  method similar to [`BytesCount::of`](https://docs.rs/divan/0.1/divan/counter/struct.BytesCount.html#method.of),
+  but with a parameter by which to multiply the size of the type.
+
+- [`BytesCount::u64`](https://docs.rs/divan/0.1.3/divan/counter/struct.BytesCount.html#method.u64),
+  [`BytesCount::f64`](https://docs.rs/divan/0.1.3/divan/counter/struct.BytesCount.html#method.f64),
+  and similar methods based on [`BytesCount::of_many`](https://docs.rs/divan/0.1.3/divan/counter/struct.BytesCount.html#method.of_many).
+
+### Removed
+
+- [`black_box`] inside benchmark loop when deferring [`Drop`] of outputs. This
+  is now done after the loop.
+
+- [`linkme`](https://docs.rs/linkme) dependency in favor of pre-`main` to
+  register benchmarks and benchmark groups. This is generally more portable
+  and reliable.
+
+### Changed
+
+- Now calling [`black_box`] at the end of the benchmark loop when deferring use
+  of inputs or [`Drop`] of outputs.
+
+## [0.1.2] - 2023-10-28
+
+### Fixed
+
+- Multi-threaded benchmarks being spread across CPUs, instead of pinning the
+  main thread to CPU 0 and having all threads inherit the main thread's
+  affinity.
+
+## [0.1.1] - 2023-10-25
+
+### Fixed
+
+- Fix using LLD as linker for Linux by using the same pre-`main` approach as
+  Windows.
+
+## 0.1.0 - 2023-10-04
+
+Initial release. See [blog post](https://nikolaivazquez.com/blog/divan/).
+
+[crate]:       https://crates.io/crates/divan
+[crate-badge]: https://img.shields.io/crates/v/divan.svg
+
+[Unreleased]: https://github.com/nvzqz/divan/compare/v0.1.17...HEAD
+[0.1.17]: https://github.com/nvzqz/divan/compare/v0.1.16...v0.1.17
+[0.1.16]: https://github.com/nvzqz/divan/compare/v0.1.15...v0.1.16
+[0.1.15]: https://github.com/nvzqz/divan/compare/v0.1.14...v0.1.15
+[0.1.14]: https://github.com/nvzqz/divan/compare/v0.1.13...v0.1.14
+[0.1.13]: https://github.com/nvzqz/divan/compare/v0.1.12...v0.1.13
+[0.1.12]: https://github.com/nvzqz/divan/compare/v0.1.11...v0.1.12
+[0.1.11]: https://github.com/nvzqz/divan/compare/v0.1.10...v0.1.11
+[0.1.10]: https://github.com/nvzqz/divan/compare/v0.1.9...v0.1.10
+[0.1.9]: https://github.com/nvzqz/divan/compare/v0.1.8...v0.1.9
+[0.1.8]: https://github.com/nvzqz/divan/compare/v0.1.7...v0.1.8
+[0.1.7]: https://github.com/nvzqz/divan/compare/v0.1.6...v0.1.7
+[0.1.6]: https://github.com/nvzqz/divan/compare/v0.1.5...v0.1.6
+[0.1.5]: https://github.com/nvzqz/divan/compare/v0.1.4...v0.1.5
+[0.1.4]: https://github.com/nvzqz/divan/compare/v0.1.3...v0.1.4
+[0.1.3]: https://github.com/nvzqz/divan/compare/v0.1.2...v0.1.3
+[0.1.2]: https://github.com/nvzqz/divan/compare/v0.1.1...v0.1.2
+[0.1.1]: https://github.com/nvzqz/divan/compare/v0.1.0...v0.1.1
+
+[#37]: https://github.com/nvzqz/divan/issues/37
+[#59]: https://github.com/nvzqz/divan/issues/59
+[#64]: https://github.com/nvzqz/divan/issues/64
+
+[`AllocProfiler`]: https://docs.rs/divan/0.1/divan/struct.AllocProfiler.html
+[`args`]: https://docs.rs/divan/latest/divan/attr.bench.html#args
+[`Bencher`]: https://docs.rs/divan/0.1/divan/struct.Bencher.html
+[`black_box`]: https://docs.rs/divan/latest/divan/fn.black_box.html
+[`BytesCount::of_iter`]: https://docs.rs/divan/0.1/divan/counter/struct.BytesCount.html#method.of_iter
+[`BytesCount::of_many`]: https://docs.rs/divan/0.1/divan/counter/struct.BytesCount.html#method.of_many
+[`consts`]: https://docs.rs/divan/latest/divan/attr.bench.html#consts
+[`CyclesCount`]: https://docs.rs/divan/0.1/divan/counter/struct.CyclesCount.html
+[`Divan::max_time`]: https://docs.rs/divan/0.1/divan/struct.Divan.html#method.max_time
+
+[`Any`]: https://doc.rust-lang.org/std/any/trait.Any.html
+[`Copy`]: https://doc.rust-lang.org/std/marker/trait.Copy.html
+[`Debug`]: https://doc.rust-lang.org/std/fmt/trait.Debug.html
+[`drop`]: https://doc.rust-lang.org/std/mem/fn.drop.html
+[`Drop`]: https://doc.rust-lang.org/std/ops/trait.Drop.html
+[`Iterator`]: https://doc.rust-lang.org/std/iter/trait.Iterator.html
+[`LazyLock`]: https://doc.rust-lang.org/std/sync/struct.LazyLock.html
+[`Send`]: https://doc.rust-lang.org/std/marker/trait.Send.html
+[`size_of`]: https://doc.rust-lang.org/std/mem/fn.size_of.html
+[`Sync`]: https://doc.rust-lang.org/std/marker/trait.Sync.html
+[`thread_local!`]: https://doc.rust-lang.org/std/macro.thread_local.html
+[`ToString`]: https://doc.rust-lang.org/std/string/trait.ToString.html
+[available parallelism]: https://doc.rust-lang.org/std/thread/fn.available_parallelism.html
+[slice]: https://doc.rust-lang.org/std/primitive.slice.html
+
+[MSRV]: https://doc.rust-lang.org/cargo/reference/rust-version.html
+
+[`pthread_key_create`]: https://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_key_create.html
diff --git a/crates/divan_compat/divan_fork/Cargo.toml b/crates/divan_compat/divan_fork/Cargo.toml
new file mode 100644
index 00000000..1133d559
--- /dev/null
+++ b/crates/divan_compat/divan_fork/Cargo.toml
@@ -0,0 +1,48 @@
+[package]
+name = "codspeed-divan-compat-walltime"
+version = "0.1.17"
+rust-version = "1.80.0"
+edition = "2021"
+authors = ["Nikolai Vazquez"]
+license = "MIT OR Apache-2.0"
+description = "A temporary compatibility layer for CodSpeed to use Divan's walltime entrypoint."
+repository = "https://github.com/nvzqz/divan"
+homepage = "https://github.com/nvzqz/divan"
+documentation = "https://docs.rs/divan"
+categories = ["development-tools::profiling"]
+keywords = ["benchmark", "criterion", "instrument", "measure", "performance"]
+readme = "README.md"
+
+[dependencies]
+divan-macros = { version = "=0.1.17" }
+
+cfg-if = "1"
+clap = { version = "4", default-features = false, features = ["std", "env"] }
+condtype = "1.3"
+regex = { package = "regex-lite", version = "0.1", default-features = false, features = ["std", "string"] }
+codspeed = { path = "../../codspeed", version = "=2.8.0-alpha.0" }
+
+[target.'cfg(unix)'.dependencies]
+libc = "0.2.148"
+
+
+[dev-dependencies]
+mimalloc = "0.1"
+
+[features]
+default = ["wrap_help"]
+help = ["clap/help"]
+wrap_help = ["help", "clap/wrap_help"]
+
+# Opt out of faster static thread-local access and instead always dynamically
+# allocate thread-local storage.
+#
+# On x86_64 macOS we use TLS key 11 (reserved for Windows ABI compatibility):
+# https://github.com/apple-oss-distributions/libpthread/blob/libpthread-519/private/pthread/tsd_private.h#L99
+dyn_thread_local = []
+
+# Benchmark internals. Not meant for public use.
+internal_benches = []
+
+[lib]
+doctest = false # Disable doctests for the fork
diff --git a/crates/divan_compat/divan_fork/LICENSE-APACHE b/crates/divan_compat/divan_fork/LICENSE-APACHE
new file mode 100644
index 00000000..d6456956
--- /dev/null
+++ b/crates/divan_compat/divan_fork/LICENSE-APACHE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/crates/divan_compat/divan_fork/LICENSE-MIT b/crates/divan_compat/divan_fork/LICENSE-MIT
new file mode 100644
index 00000000..8faad18f
--- /dev/null
+++ b/crates/divan_compat/divan_fork/LICENSE-MIT
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Nikolai Vazquez
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/crates/divan_compat/divan_fork/README.md b/crates/divan_compat/divan_fork/README.md
new file mode 100644
index 00000000..2675f8c2
--- /dev/null
+++ b/crates/divan_compat/divan_fork/README.md
@@ -0,0 +1,106 @@
+<div align="center">
+    <h1>Divan</h1>
+    <a href="https://docs.rs/divan">
+        <img src="https://img.shields.io/crates/v/divan.svg?label=docs&color=blue&logo=rust" alt="docs.rs badge">
+    </a>
+    <a href="https://crates.io/crates/divan">
+        <img src="https://img.shields.io/crates/d/divan.svg" alt="Downloads badge">
+    </a>
+    <a href="https://github.com/nvzqz/divan">
+        <img src="https://img.shields.io/github/stars/nvzqz/divan.svg?style=flat&color=black" alt="GitHub stars badge">
+    </a>
+    <a href="https://github.com/nvzqz/divan/actions/workflows/ci.yml">
+        <img src="https://github.com/nvzqz/divan/actions/workflows/ci.yml/badge.svg" alt="CI build status badge">
+    </a>
+    <p>
+        <strong>Comfy bench</strong>marking for Rust projects, brought to you by
+        <a href="https://nikolaivazquez.com">Nikolai Vazquez</a>.
+    </p>
+</div>
+
+## Sponsor
+
+If you or your company find Divan valuable, consider [sponsoring on
+GitHub](https://github.com/sponsors/nvzqz) or [donating via
+PayPal](https://paypal.me/nvzqz). Sponsorships help me progress on what's
+possible with benchmarking in Rust.
+
+## Guide
+
+A guide is being worked on. In the meantime, see:
+- [Announcement post](https://nikolaivazquez.com/blog/divan/)
+- ["Proving Performance" FOSDEM talk](https://youtu.be/P87C4jNakGs)
+
+## Getting Started
+
+Divan `0.1.17` requires Rust `1.80.0` or later.
+
+1. Add the following to your project's [`Cargo.toml`](https://doc.rust-lang.org/cargo/reference/manifest.html):
+
+    ```toml
+    [dev-dependencies]
+    divan = "0.1.17"
+
+    [[bench]]
+    name = "example"
+    harness = false
+    ```
+
+2. Create a benchmarks file at `benches/example.rs`[^1] with your benchmarking code:
+
+    ```rust
+    fn main() {
+        // Run registered benchmarks.
+        divan::main();
+    }
+
+    // Register a `fibonacci` function and benchmark it over multiple cases.
+    #[divan::bench(args = [1, 2, 4, 8, 16, 32])]
+    fn fibonacci(n: u64) -> u64 {
+        if n <= 1 {
+            1
+        } else {
+            fibonacci(n - 2) + fibonacci(n - 1)
+        }
+    }
+    ```
+
+3. Run your benchmarks with [`cargo bench`](https://doc.rust-lang.org/cargo/commands/cargo-bench.html):
+
+    ```txt
+    example       fastest  │ slowest  │ median   │ mean     │ samples │ iters
+    ╰─ fibonacci           │          │          │          │         │
+       ├─ 1       0.626 ns │ 1.735 ns │ 0.657 ns │ 0.672 ns │ 100     │ 819200
+       ├─ 2       2.767 ns │ 3.154 ns │ 2.788 ns │ 2.851 ns │ 100     │ 204800
+       ├─ 4       6.816 ns │ 7.671 ns │ 7.061 ns │ 7.167 ns │ 100     │ 102400
+       ├─ 8       57.31 ns │ 62.51 ns │ 57.96 ns │ 58.55 ns │ 100     │ 12800
+       ├─ 16      2.874 µs │ 3.812 µs │ 2.916 µs │ 3.006 µs │ 100     │ 200
+       ╰─ 32      6.267 ms │ 6.954 ms │ 6.283 ms │ 6.344 ms │ 100     │ 100
+    ```
+
+See [`#[divan::bench]`][bench_attr] for info on benchmark function registration.
+
+## Examples
+
+Practical example benchmarks can be found in the [`examples/benches`](https://github.com/nvzqz/divan/tree/main/examples/benches)
+directory. These can be benchmarked locally by running:
+
+```sh
+git clone https://github.com/nvzqz/divan.git
+cd divan
+
+cargo bench -q -p examples --all-features
+```
+
+More thorough usage examples can be found in the [`#[divan::bench]` documentation][bench_attr_examples].
+
+## License
+
+Like the Rust project, this library may be used under either the
+[MIT License](https://github.com/nvzqz/divan/blob/main/LICENSE-MIT) or
+[Apache License (Version 2.0)](https://github.com/nvzqz/divan/blob/main/LICENSE-APACHE).
+
+[^1]: Within your crate directory, i.e. [`$CARGO_MANIFEST_DIR`](https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-crates)
+
+[bench_attr]: https://docs.rs/divan/latest/divan/attr.bench.html
+[bench_attr_examples]: https://docs.rs/divan/latest/divan/attr.bench.html#examples
diff --git a/crates/divan_compat/divan_fork/WANTED.md b/crates/divan_compat/divan_fork/WANTED.md
new file mode 100644
index 00000000..eef56682
--- /dev/null
+++ b/crates/divan_compat/divan_fork/WANTED.md
@@ -0,0 +1,47 @@
+# Wanted
+
+It would be great to have the following features added to Divan. If you have
+ideas to expand this list, please [find](https://github.com/nvzqz/divan/discussions)
+or [create](https://github.com/nvzqz/divan/discussions/new?category=ideas) a
+discussion first.
+
+- Async benchmarks
+
+- Baseline benchmark
+    - Should match baselines across equal generic types and constants
+    - Idea:
+    ```rs
+    #[divan::bench]
+    fn old() { ... }
+
+    #[divan::bench(baseline = old)]
+    fn new() { ... }
+    ```
+
+- Cross-device: run benchmarks on other devices and report the data on the local
+device
+
+- HTML output
+
+- CSV output
+
+- Custom counters
+
+- Time complexity of counters
+    - Also space complexity when measuring heap allocation
+
+- Measure heap allocations
+    - Custom [`GlobalAlloc`](https://doc.rust-lang.org/std/alloc/trait.GlobalAlloc.html)
+    that wraps another `GlobalAlloc`, defaulting to [`System`](https://doc.rust-lang.org/std/alloc/struct.System.html)
+
+- Custom timers
+
+- Timer for kernel/user mode
+    - Unix:
+        - [`getrusage(2)`](https://pubs.opengroup.org/onlinepubs/9699919799/functions/getrusage.html)
+        - Per-thread:
+            - Linux/FreeBSD/OpenBSD: [`RUSAGE_THREAD`](https://man7.org/linux/man-pages/man2/getrusage.2.html)
+            - macOS/iOS: [`thread_info(mach_thread_self(), ...)`](https://www.gnu.org/software/hurd/gnumach-doc/Thread-Information.html)
+    - Windows:
+        - [`GetProcessTimes`](https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getprocesstimes)
+        - [`GetThreadTimes`](https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadtimes)
diff --git a/crates/divan_compat/divan_fork/examples/Cargo.toml b/crates/divan_compat/divan_fork/examples/Cargo.toml
new file mode 100644
index 00000000..028895a3
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/Cargo.toml
@@ -0,0 +1,109 @@
+[package]
+name = "examples"
+version = "0.0.0"
+edition = "2021"
+authors = ["Nikolai Vazquez"]
+license = "MIT OR Apache-2.0"
+description = "Examples for Divan, a comfy benchmarking framework."
+readme = "../README.md"
+publish = false
+
+[dependencies]
+divan = { workspace = true }
+fastrand = "2"
+image = { version = "0.24", optional = true }
+libc = "0.2.147"
+rayon = "1"
+
+# Search
+ordsearch = "0.2.5"
+wyhash = "0.5"
+
+# Hash
+blake3 = { version = "1.4", optional = true, features = ["rayon"] }
+digest = { version = "*", optional = true }
+fnv = { version = "1", optional = true }
+highway = { version = "1.1", optional = true }
+metrohash = { version = "1", optional = true }
+seahash = { version = "4.1", optional = true }
+sha1 = { version = "0.10", optional = true }
+sha2 = { version = "0.10", optional = true }
+sha3 = { version = "0.10", optional = true }
+twox-hash = { version = "1.6", optional = true }
+
+[target.'cfg(unix)'.dependencies]
+libc = { workspace = true }
+
+[target.'cfg(target_os = "macos")'.dependencies]
+mach2 = "0.4"
+
+[target.'cfg(windows)'.dependencies] # winapi only provides bindings on Windows targets
+winapi = { version = "0.3.9", features = ["processthreadsapi"] }
+
+[features]
+hash = [
+    "blake3",
+    "digest",
+    "fnv",
+    "highway",
+    "metrohash",
+    "seahash",
+    "sha1",
+    "sha2",
+    "sha3",
+    "twox-hash",
+]
+
+[[bench]]
+name = "atomic"
+harness = false
+
+[[bench]]
+name = "collections"
+harness = false
+
+[[bench]]
+name = "hash"
+harness = false
+required-features = ["hash"]
+
+[[bench]]
+name = "image"
+harness = false
+required-features = ["image"]
+
+[[bench]]
+name = "math"
+harness = false
+
+[[bench]]
+name = "memcpy"
+harness = false
+
+[[bench]]
+name = "panic"
+harness = false
+
+[[bench]]
+name = "scratch"
+harness = false
+
+[[bench]]
+name = "search"
+harness = false
+
+[[bench]]
+name = "sort"
+harness = false
+
+[[bench]]
+name = "string"
+harness = false
+
+[[bench]]
+name = "threads"
+harness = false
+
+[[bench]]
+name = "time"
+harness = false
diff --git a/crates/divan_compat/divan_fork/examples/README.md b/crates/divan_compat/divan_fork/examples/README.md
new file mode 100644
index 00000000..0508fe95
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/README.md
@@ -0,0 +1,13 @@
+# Divan Examples
+
+Practical example benchmarks can be found in the [`examples/benches`](https://github.com/nvzqz/divan/tree/main/examples/benches)
+directory. These can be benchmarked locally by running:
+
+```sh
+git clone https://github.com/nvzqz/divan.git
+cd divan
+
+cargo bench -q -p examples --all-features
+```
+
+More thorough usage examples can be found in the [`#[divan::bench]` documentation](https://docs.rs/divan/latest/divan/attr.bench.html#examples).
diff --git a/crates/divan_compat/divan_fork/examples/benches/README.md b/crates/divan_compat/divan_fork/examples/benches/README.md
new file mode 100644
index 00000000..0508fe95
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/README.md
@@ -0,0 +1,13 @@
+# Divan Examples
+
+Practical example benchmarks can be found in the [`examples/benches`](https://github.com/nvzqz/divan/tree/main/examples/benches)
+directory. These can be benchmarked locally by running:
+
+```sh
+git clone https://github.com/nvzqz/divan.git
+cd divan
+
+cargo bench -q -p examples --all-features
+```
+
+More thorough usage examples can be found in the [`#[divan::bench]` documentation](https://docs.rs/divan/latest/divan/attr.bench.html#examples).
diff --git a/crates/divan_compat/divan_fork/examples/benches/atomic.rs b/crates/divan_compat/divan_fork/examples/benches/atomic.rs
new file mode 100644
index 00000000..973ec1bb
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/atomic.rs
@@ -0,0 +1,135 @@
+use std::sync::atomic::*;
+
+use divan::black_box;
+
+fn main() { // Bench binary entry point (`harness = false` in the examples manifest).
+    divan::main(); // Run the benchmarks registered via `#[divan::bench]` in this file.
+}
+
+// Thread counts given to every `bench_group` below: available parallelism (0), baseline (1), and common CPU core counts.
+const THREADS: &[usize] = &[0, 1, 4, 16];
+
+#[divan::bench_group(threads = THREADS)] // Group-level `threads` option, fed from the THREADS const.
+mod basic { // Plain atomic load/store benchmarks.
+    use super::*;
+
+    #[divan::bench]
+    fn load() -> usize { // Returns the value read from a function-local static.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        black_box(&N).load(Ordering::Relaxed) // Reference goes through `black_box` before the Relaxed load.
+    }
+
+    #[divan::bench]
+    fn store() { // Writes a constant into a function-local static; returns nothing.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        black_box(&N).store(black_box(2), Ordering::Relaxed); // Both the target and the stored value pass through `black_box`.
+    }
+}
+
+#[divan::bench_group(threads = THREADS)] // Same `threads` list as the other groups in this file.
+mod update { // Read-modify-write benchmarks; each returns the `fetch_*` result.
+    use super::*;
+
+    #[divan::bench]
+    fn fetch_or() -> usize { // Atomic OR with 1.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        black_box(&N).fetch_or(black_box(1), Ordering::Relaxed)
+    }
+
+    #[divan::bench]
+    fn fetch_and() -> usize { // Atomic AND with 1.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        black_box(&N).fetch_and(black_box(1), Ordering::Relaxed)
+    }
+
+    #[divan::bench]
+    fn fetch_xor() -> usize { // Atomic XOR with 1.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        black_box(&N).fetch_xor(black_box(1), Ordering::Relaxed)
+    }
+
+    #[divan::bench]
+    fn fetch_nand() -> usize { // Atomic NAND with 1.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        black_box(&N).fetch_nand(black_box(1), Ordering::Relaxed)
+    }
+
+    #[divan::bench]
+    fn fetch_add() -> usize { // Atomic add of 1.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        black_box(&N).fetch_add(black_box(1), Ordering::Relaxed)
+    }
+
+    #[divan::bench]
+    fn fetch_sub() -> usize { // Atomic subtract of 1.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        black_box(&N).fetch_sub(black_box(1), Ordering::Relaxed)
+    }
+}
+
+#[divan::bench_group(threads = THREADS)] // Same `threads` list as the other groups in this file.
+mod compare_exchange { // Read-modify-write ops built from a `compare_exchange` retry loop.
+    use super::*;
+
+    #[divan::bench]
+    fn fetch_mul() -> usize { // Multiplies the stored value by 2; returns the value that was replaced.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        let mut current = black_box(&N).load(Ordering::Relaxed); // Initial guess for the expected value.
+        loop {
+            match black_box(&N).compare_exchange(
+                current, // Expected value.
+                current.wrapping_mul(black_box(2)), // Replacement value.
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => return current, // Swap succeeded.
+                Err(n) => current = n, // Swap failed: retry with the value reported in `Err`.
+            }
+        }
+    }
+
+    #[divan::bench]
+    fn fetch_div() -> usize { // Divides the stored value by 2; returns the value that was replaced.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        let mut current = black_box(&N).load(Ordering::Relaxed); // Initial guess for the expected value.
+        loop {
+            match black_box(&N).compare_exchange(
+                current, // Expected value.
+                current.wrapping_div(black_box(2)), // Replacement value.
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => return current, // Swap succeeded.
+                Err(n) => current = n, // Swap failed: retry with the value reported in `Err`.
+            }
+        }
+    }
+
+    #[divan::bench]
+    fn fetch_mod() -> usize { // Replaces the stored value with its remainder mod 2; returns the value that was replaced.
+        static N: AtomicUsize = AtomicUsize::new(1);
+
+        let mut current = black_box(&N).load(Ordering::Relaxed); // Initial guess for the expected value.
+        loop {
+            match black_box(&N).compare_exchange(
+                current, // Expected value.
+                current % black_box(2), // Replacement value (plain `%`, unlike the `wrapping_*` calls in the siblings).
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => return current, // Swap succeeded.
+                Err(n) => current = n, // Swap failed: retry with the value reported in `Err`.
+            }
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/collections.rs b/crates/divan_compat/divan_fork/examples/benches/collections.rs
new file mode 100644
index 00000000..87797146
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/collections.rs
@@ -0,0 +1,161 @@
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench collections
+//! ```
+
+use divan::{black_box, AllocProfiler, Bencher};
+use std::collections::{BTreeSet, BinaryHeap, HashSet, LinkedList, VecDeque};
+
+pub fn collect_nums<T: FromIterator<i32>>(n: usize) -> T { // Builds a `T` from the integers `0..n`.
+    black_box(0..(n as i32)).collect() // `n` is cast with `as`; the range passes through `black_box` before collecting.
+}
+
+pub trait WithCapacity { // Lets the generic `with_capacity` bench construct any listed collection.
+    fn with_capacity(c: usize) -> Self; // Constructor taking a capacity `c`.
+}
+
+pub trait Clear { // Lets the generic `clear` bench call `clear` on any listed collection.
+    fn clear(&mut self); // Implemented via `impl_clear!` by forwarding to the collection's own `clear`.
+}
+
+pub trait PopFront<T> { // Uniform "remove first element" operation for the `pop_front` bench.
+    fn pop_front(&mut self) -> Option<T>; // The `Vec` impl returns `None` when empty; the others forward to the inherent method.
+}
+
+impl<T> PopFront<T> for Vec<T> { // `Vec` has no inherent `pop_front`, so it is built from `remove(0)`.
+    fn pop_front(&mut self) -> Option<T> {
+        if self.is_empty() { // Guard: `remove(0)` would panic on an empty vector.
+            None
+        } else {
+            Some(self.remove(0)) // Removes index 0; the remaining elements shift down.
+        }
+    }
+}
+
+impl<T> PopFront<T> for VecDeque<T> { // Forwards to the standard-library method.
+    fn pop_front(&mut self) -> Option<T> {
+        self.pop_front() // Resolves to the inherent `VecDeque::pop_front` (inherent methods win over trait methods), so no recursion.
+    }
+}
+
+impl<T> PopFront<T> for LinkedList<T> { // Forwards to the standard-library method.
+    fn pop_front(&mut self) -> Option<T> {
+        self.pop_front() // Resolves to the inherent `LinkedList::pop_front` (inherent methods win over trait methods), so no recursion.
+    }
+}
+
+macro_rules! impl_with_capacity { // Generates `WithCapacity` impls for a comma-separated list of collection type names.
+    ($($t:ident),+) => { // One or more identifiers, e.g. `Vec, VecDeque`.
+        $(impl WithCapacity for $t<i32> { // The impl is only for the `i32` element type.
+            fn with_capacity(c: usize) -> Self {
+                $t::with_capacity(c) // Forward to the collection's own constructor.
+            }
+        })+
+    };
+}
+
+macro_rules! impl_clear { // Generates `Clear` impls for a comma-separated list of collection type names.
+    ($($t:ident),+) => { // One or more identifiers, e.g. `Vec, VecDeque`.
+        $(impl Clear for $t<i32> { // The impl is only for the `i32` element type.
+            fn clear(&mut self) {
+                $t::clear(self); // Path call names the collection's own `clear` explicitly.
+            }
+        })+
+    };
+}
+
+impl_with_capacity!(Vec, VecDeque, BinaryHeap, HashSet); // `LinkedList` and `BTreeSet` are not listed here.
+impl_clear!(Vec, VecDeque, BinaryHeap, HashSet, LinkedList, BTreeSet); // All six benchmarked collections.
+
+#[global_allocator] // Makes ALLOC the allocator for this whole bench binary.
+static ALLOC: AllocProfiler = AllocProfiler::system(); // NOTE(review): presumably wraps the system allocator to record allocations; confirm in the Divan docs.
+
+fn main() { // Bench binary entry point (`harness = false` in the examples manifest).
+    divan::main(); // Run the benchmarks registered via `#[divan::bench]` in this file.
+}
+
+const LENS: &[usize] = &[0, 8, 64, 1024]; // Collection lengths passed as `args` to the benchmarks below.
+
+#[divan::bench(types = [ // One benchmark instance per listed type `T`.
+    Vec<i32>,
+    VecDeque<i32>,
+    LinkedList<i32>,
+    BinaryHeap<i32>,
+    HashSet<i32>,
+    BTreeSet<i32>,
+])]
+fn default<T: Default>() -> T { // Constructs the default collection and returns it.
+    T::default()
+}
+
+#[divan::bench( // Type x length matrix for capacity-preallocating construction.
+    types = [ // Only the four types given a `WithCapacity` impl above.
+        Vec<i32>,
+        VecDeque<i32>,
+        BinaryHeap<i32>,
+        HashSet<i32>,
+    ],
+    args = LENS, // Each length becomes the `len` argument.
+)]
+fn with_capacity<T: WithCapacity>(bencher: Bencher, len: usize) {
+    bencher.counter(len).bench(|| T::with_capacity(len)) // `len` is also passed as the counter; the closure builds one collection per call.
+}
+
+#[divan::bench( // Type x length matrix for building a collection from an iterator.
+    types = [
+        Vec<i32>,
+        VecDeque<i32>,
+        LinkedList<i32>,
+        BinaryHeap<i32>,
+        HashSet<i32>,
+        BTreeSet<i32>,
+    ],
+    args = LENS, // Each length becomes the `len` argument.
+)]
+fn from_iter<T: FromIterator<i32>>(bencher: Bencher, len: usize) {
+    bencher.counter(len).bench(|| collect_nums::<T>(len)) // The benched closure collects `0..len` into `T`.
+}
+
+#[divan::bench( // Type x length matrix for dropping a populated collection.
+    types = [
+        Vec<i32>,
+        VecDeque<i32>,
+        LinkedList<i32>,
+        BinaryHeap<i32>,
+        HashSet<i32>,
+        BTreeSet<i32>,
+    ],
+    args = LENS, // Each length becomes the `len` argument.
+)]
+fn drop<T: FromIterator<i32>>(bencher: Bencher, len: usize) {
+    bencher.counter(len).with_inputs(|| collect_nums::<T>(len)).bench_values(std::mem::drop); // Inputs come from `with_inputs`; presumably only the `drop` call is timed (confirm in Divan docs).
+}
+
+#[divan::bench( // Type x length matrix for clearing a populated collection.
+    types = [
+        Vec<i32>,
+        VecDeque<i32>,
+        LinkedList<i32>,
+        BinaryHeap<i32>,
+        HashSet<i32>,
+        BTreeSet<i32>,
+    ],
+    args = LENS, // Each length becomes the `len` argument.
+    max_time = 1, // NOTE(review): presumably a time cap in seconds; confirm units in the Divan docs.
+)]
+fn clear<T: FromIterator<i32> + Clear>(bencher: Bencher, len: usize) {
+    bencher.counter(len).with_inputs(|| collect_nums::<T>(len)).bench_refs(T::clear); // Each generated collection is handed to `Clear::clear` by reference.
+}
+
+#[divan::bench( // Type x length matrix for removing the first element.
+    types = [ // Only the three types given a `PopFront` impl above.
+        Vec<i32>,
+        VecDeque<i32>,
+        LinkedList<i32>,
+    ],
+    args = LENS, // Each length becomes the `len` argument.
+)]
+fn pop_front<T: FromIterator<i32> + PopFront<i32>>(bencher: Bencher, len: usize) {
+    bencher.counter(len).with_inputs(|| collect_nums::<T>(len)).bench_refs(T::pop_front); // `T::pop_front` is the `PopFront` trait method, called on each generated collection.
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/hash.rs b/crates/divan_compat/divan_fork/examples/benches/hash.rs
new file mode 100644
index 00000000..e595c625
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/hash.rs
@@ -0,0 +1,135 @@
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench hash --features hash
+//! ```
+
+use digest::Digest;
+use divan::AllocProfiler;
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    divan::main();
+}
+
+struct Blake3;
+struct Blake3Par;
+struct Sha1;
+struct Sha2_256;
+struct Sha2_512;
+struct Sha3_256;
+struct Sha3_512;
+
+/// [`std::hash::Hasher::write`] + [`std::hash::Hasher::finish`].
+#[divan::bench(
+    types = [
+        Blake3,
+        Blake3Par,
+        fnv::FnvHasher,
+        highway::HighwayHasher,
+        metrohash::MetroHash128,
+        metrohash::MetroHash64,
+        seahash::SeaHasher,
+        Sha1,
+        Sha2_256,
+        Sha2_512,
+        Sha3_256,
+        Sha3_512,
+        std::collections::hash_map::DefaultHasher,
+        twox_hash::XxHash32,
+        twox_hash::XxHash64,
+        wyhash::WyHash,
+    ],
+    args = [0, 8, 64, 1024, 1024 * 1024],
+    max_time = 1,
+)]
+fn hash<H>(bencher: divan::Bencher, len: usize)
+where
+    H: Hasher,
+{
+    let bytes: Vec<u8> = {
+        let mut rng = fastrand::Rng::new();
+        (0..len).map(|_| rng.u8(..)).collect()
+    };
+
+    bencher
+        .counter(divan::counter::BytesCount::new(len))
+        .with_inputs(|| bytes.clone())
+        .bench_refs(|bytes| H::hash(bytes));
+}
+
+trait Hasher {
+    type Hash;
+
+    fn hash(bytes: &[u8]) -> Self::Hash;
+}
+
+/// Any default-constructible `std::hash::Hasher` hashes via `write` + `finish`.
+impl<H: std::hash::Hasher + Default> Hasher for H {
+    type Hash = u64;
+    fn hash(bytes: &[u8]) -> Self::Hash {
+        let mut state = H::default();
+        std::hash::Hasher::write(&mut state, bytes);
+        std::hash::Hasher::finish(&state)
+    }
+}
+
+impl Hasher for Blake3 {
+    type Hash = [u8; 32];
+
+    fn hash(bytes: &[u8]) -> Self::Hash {
+        *blake3::hash(bytes).as_bytes()
+    }
+}
+
+impl Hasher for Blake3Par {
+    type Hash = [u8; 32];
+
+    fn hash(bytes: &[u8]) -> Self::Hash {
+        let mut hasher = blake3::Hasher::new();
+        hasher.update_rayon(bytes);
+        *hasher.finalize().as_bytes()
+    }
+}
+
+impl Hasher for Sha1 {
+    type Hash = [u8; 20];
+
+    fn hash(bytes: &[u8]) -> Self::Hash {
+        sha1::Sha1::new_with_prefix(bytes).finalize().into()
+    }
+}
+
+impl Hasher for Sha2_256 {
+    type Hash = [u8; 32];
+
+    fn hash(bytes: &[u8]) -> Self::Hash {
+        sha2::Sha256::new_with_prefix(bytes).finalize().into()
+    }
+}
+
+impl Hasher for Sha2_512 {
+    type Hash = [u8; 64];
+
+    fn hash(bytes: &[u8]) -> Self::Hash {
+        sha2::Sha512::new_with_prefix(bytes).finalize().into()
+    }
+}
+
+impl Hasher for Sha3_256 {
+    type Hash = [u8; 32];
+
+    fn hash(bytes: &[u8]) -> Self::Hash {
+        sha3::Sha3_256::new_with_prefix(bytes).finalize().into()
+    }
+}
+
+impl Hasher for Sha3_512 {
+    type Hash = [u8; 64];
+
+    fn hash(bytes: &[u8]) -> Self::Hash {
+        sha3::Sha3_512::new_with_prefix(bytes).finalize().into()
+    }
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/image.rs b/crates/divan_compat/divan_fork/examples/benches/image.rs
new file mode 100644
index 00000000..4fca23b0
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/image.rs
@@ -0,0 +1,43 @@
+//! Benchmarks the [`image`](https://docs.rs/image) crate.
+//!
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench image --features image
+//! ```
+
+use divan::{black_box, counter::BytesCount, AllocProfiler, Bencher};
+use image::{GenericImage, ImageBuffer, Rgba};
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    divan::main();
+}
+
+fn make_image(pixel: Rgba<u8>) -> ImageBuffer<Rgba<u8>, Vec<u8>> {
+    ImageBuffer::from_pixel(2048, 2048, pixel)
+}
+
+// https://github.com/image-rs/image/blob/v0.24.6/benches/copy_from.rs
+#[divan::bench(max_time = 1)]
+fn copy_from(bencher: Bencher) {
+    let source = make_image(Rgba([255u8, 0, 0, 255]));
+    let mut target = make_image(Rgba([0u8, 0, 0, 255]));
+    let byte_count = BytesCount::of_slice(&*source);
+
+    let counted = bencher.counter(byte_count);
+    counted.bench_local(|| black_box(&mut target).copy_from(black_box(&source), 0, 0));
+}
+
+/// Baseline for `copy_from`: a plain slice copy of the same pixel bytes.
+#[divan::bench(max_time = 1)]
+fn memcpy(bencher: Bencher) {
+    let source = make_image(Rgba([255u8, 0, 0, 255]));
+    let mut target = vec![0; source.len()];
+    let byte_count = BytesCount::of_slice(&*source);
+
+    let counted = bencher.counter(byte_count);
+    counted.bench_local(|| black_box(&mut target).copy_from_slice(black_box(&source)));
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/math.rs b/crates/divan_compat/divan_fork/examples/benches/math.rs
new file mode 100644
index 00000000..ef3f2897
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/math.rs
@@ -0,0 +1,118 @@
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench math
+//! ```
+
+use divan::black_box;
+use std::collections::{BTreeMap, HashMap};
+
+fn main() {
+    divan::main();
+}
+
+#[divan::bench]
+fn add() -> i32 {
+    black_box(2) + black_box(1)
+}
+
+#[divan::bench]
+#[ignore]
+fn sub() -> i32 {
+    black_box(2) - black_box(1)
+}
+
+#[divan::bench]
+fn mul() -> i32 {
+    black_box(2) * black_box(1)
+}
+
+#[divan::bench]
+fn div() -> i32 {
+    black_box(2) / black_box(1)
+}
+
+#[divan::bench]
+fn rem() -> i32 {
+    black_box(2) % black_box(1)
+}
+
+// 1, 1, 2, 3, 5, ...
+mod fibonacci {
+    use super::*;
+
+    const VALUES: &[u64] = &[0, 5, 10, 20, 30, 40];
+
+    // O(n): walks the sequence once, keeping only the last two terms.
+    #[divan::bench(args = VALUES)]
+    fn iterative(n: u64) -> u64 {
+        // `pair` is (previous, current), seeded with the first two terms.
+        let mut pair: (u64, u64) = (1, 1);
+
+        // Terms 0 and 1 are the seeds, so only steps 2..=n advance the pair.
+        for _ in 2..=n {
+            pair = (pair.1, pair.0 + pair.1);
+        }
+
+        // The second slot always holds term `n`.
+        pair.1
+    }
+
+    // O(2^n): every call above the base case fans out into two more calls.
+    #[divan::bench(args = VALUES, max_time = 1)]
+    fn recursive(n: u64) -> u64 {
+        match n {
+            // Terms 0 and 1 are both 1.
+            0 | 1 => 1,
+            _ => recursive(n - 2) + recursive(n - 1),
+        }
+    }
+
+    trait Map: Default {
+        fn get(&self, key: u64) -> Option<u64>;
+        fn set(&mut self, key: u64, value: u64);
+    }
+
+    impl Map for HashMap<u64, u64> {
+        // Delegates to the inherent `HashMap` methods, spelled out by path.
+        fn get(&self, key: u64) -> Option<u64> {
+            HashMap::get(self, &key).copied()
+        }
+        fn set(&mut self, key: u64, value: u64) {
+            HashMap::insert(self, key, value);
+        }
+    }
+
+    impl Map for BTreeMap<u64, u64> {
+        // Delegates to the inherent `BTreeMap` methods, spelled out by path.
+        fn get(&self, key: u64) -> Option<u64> {
+            BTreeMap::get(self, &key).copied()
+        }
+        fn set(&mut self, key: u64, value: u64) {
+            BTreeMap::insert(self, key, value);
+        }
+    }
+
+    // O(n): each term above the base cases is computed once, then cached.
+    #[divan::bench(
+        types = [BTreeMap<u64, u64>, HashMap<u64, u64>],
+        args = VALUES,
+    )]
+    fn recursive_memoized<M: Map>(n: u64) -> u64 {
+        fn fib<M: Map>(n: u64, memo: &mut M) -> u64 {
+            match memo.get(n) {
+                // Already computed: serve it straight from the cache.
+                Some(known) => known,
+                // Base cases are returned without being cached.
+                None if n <= 1 => 1,
+                None => {
+                    let sum = fib(n - 2, memo) + fib(n - 1, memo);
+                    memo.set(n, sum);
+                    sum
+                }
+            }
+        }
+
+        fib(n, &mut M::default())
+    }
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/memcpy.rs b/crates/divan_compat/divan_fork/examples/benches/memcpy.rs
new file mode 100644
index 00000000..d665d792
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/memcpy.rs
@@ -0,0 +1,107 @@
+use divan::{counter::BytesCount, Bencher};
+use fastrand::Rng;
+
+fn main() {
+    divan::main();
+}
+
+const LENS: &[usize] = &[
+    1,
+    2,
+    8,
+    16,
+    64,
+    512,
+    1024 * 4,
+    1024 * 16,
+    1024 * 64,
+    1024 * 256,
+    1024 * 1024,
+    1024 * 1024 * 4,
+];
+
+#[divan::bench(args = LENS)]
+fn memcpy(bencher: Bencher, len: usize) {
+    bencher.counter(BytesCount::new(len)).with_inputs(Input::gen(len)).bench_local_refs(
+        |input| unsafe { // SAFETY: distinct buffers, each with >= `len` bytes past its offset.
+            let src_ptr = input.src_ptr();
+            let dst_ptr = input.dst_ptr();
+            libc::memcpy(dst_ptr.cast(), src_ptr.cast(), len);
+        },
+    )
+}
+
+#[divan::bench(args = LENS)]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn movsb(bencher: Bencher, len: usize) {
+    use std::arch::asm;
+
+    bencher.counter(BytesCount::new(len)).with_inputs(Input::gen(len)).bench_local_refs(
+        |input| unsafe { // SAFETY: distinct buffers, each with >= `len` bytes past its offset.
+            #[cfg(target_arch = "x86")]
+            asm!(
+                "rep movsb",
+                inout("ecx") len => _, // byte count; final register value is discarded
+                inout("esi") input.src_ptr() => _,
+                inout("edi") input.dst_ptr() => _,
+                options(nostack, preserves_flags),
+            );
+
+            #[cfg(target_arch = "x86_64")]
+            asm!(
+                "rep movsb",
+                inout("rcx") len => _, // byte count; final register value is discarded
+                inout("rsi") input.src_ptr() => _,
+                inout("rdi") input.dst_ptr() => _,
+                options(nostack, preserves_flags),
+            );
+        },
+    )
+}
+
+/// Source and destination buffers plus a start offset into each.
+///
+/// The offsets are zero or random, and always leave at least the requested
+/// length after them. This enables us to benchmark unaligned writes. We
+/// generate these as part of the input to not add benchmark time.
+struct Input {
+    src_buf: Box<[u8]>,
+    dst_buf: Box<[u8]>,
+    src_offset: usize,
+    dst_offset: usize,
+}
+
+impl Input {
+    fn gen(len: usize) -> impl FnMut() -> Self {
+        let mut rng = Rng::default();
+        move || {
+            // Vary buffers by length rather than adhere to nice numbers.
+            let max_len = len + (len / 8);
+
+            let src_len = rng.usize(len..=max_len);
+            let dst_len = rng.usize(len..=max_len);
+
+            let src_buf: Box<[u8]> = (0..src_len).map(|_| rng.u8(..)).collect();
+            let dst_buf: Box<[u8]> = (0..dst_len).map(|_| rng.u8(..)).collect();
+
+            // 50% chance of the copy being aligned. Aligned writes are
+            // potentially much faster.
+            let is_aligned = rng.bool();
+            let (src_offset, dst_offset) = if is_aligned {
+                (0, 0)
+            } else {
+                (rng.usize(..=src_len - len), rng.usize(..=dst_len - len))
+            };
+
+            Input { src_buf, dst_buf, src_offset, dst_offset }
+        }
+    }
+
+    fn src_ptr(&self) -> *const u8 {
+        self.src_buf.as_ptr().wrapping_add(self.src_offset)
+    }
+
+    fn dst_ptr(&mut self) -> *mut u8 {
+        self.dst_buf.as_mut_ptr().wrapping_add(self.dst_offset)
+    }
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/panic.rs b/crates/divan_compat/divan_fork/examples/benches/panic.rs
new file mode 100644
index 00000000..078fc47f
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/panic.rs
@@ -0,0 +1,64 @@
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench panic
+//! ```
+
+use std::panic;
+
+use divan::{black_box, black_box_drop, AllocProfiler};
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    // Silence panics.
+    panic::set_hook(Box::new(|_| {}));
+
+    divan::main();
+}
+
+// Available parallelism (0), baseline (1), and common CPU core counts.
+const THREADS: &[usize] = &[0, 1, 4, 16];
+
+#[divan::bench]
+#[track_caller]
+fn caller_location() -> &'static panic::Location<'static> {
+    panic::Location::caller()
+}
+
+#[divan::bench_group(threads = THREADS)]
+mod hook {
+    use super::*;
+
+    #[divan::bench]
+    fn set() {
+        panic::set_hook(Box::new(|_| {}));
+    }
+
+    #[divan::bench]
+    fn take() -> impl Drop {
+        panic::take_hook()
+    }
+
+    #[divan::bench]
+    fn take_and_drop() {
+        black_box_drop(panic::take_hook());
+    }
+}
+
+mod catch_unwind {
+    use super::*;
+
+    #[divan::bench]
+    fn panic() -> std::thread::Result<()> {
+        let unwinding: fn() = || panic!(); // callee that always panics
+        panic::catch_unwind(black_box(unwinding))
+    }
+
+    #[divan::bench]
+    fn success() -> std::thread::Result<()> {
+        let returning: fn() = || {}; // callee that returns normally
+        panic::catch_unwind(black_box(returning))
+    }
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/scratch.rs b/crates/divan_compat/divan_fork/examples/benches/scratch.rs
new file mode 100644
index 00000000..f8e1aa3c
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/scratch.rs
@@ -0,0 +1,15 @@
+//! Scratch space for benchmarks.
+//!
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench scratch
+//! ```
+
+// Uncomment the code below to measure heap allocations.
+// #[global_allocator]
+// static ALLOC: divan::AllocProfiler = divan::AllocProfiler::system();
+
+fn main() {
+    divan::main();
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/search.rs b/crates/divan_compat/divan_fork/examples/benches/search.rs
new file mode 100644
index 00000000..14e3c106
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/search.rs
@@ -0,0 +1,125 @@
+use std::{
+    collections::{hash_map::RandomState, BTreeSet, HashSet},
+    hash::BuildHasher,
+};
+
+use divan::{black_box_drop, AllocProfiler, Bencher};
+use fastrand::Rng;
+use ordsearch::OrderedCollection;
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    divan::Divan::from_args()
+        .items_count(
+            // Every benchmark iteration searches for a single element.
+            1u32,
+        )
+        .main();
+}
+
+const SIZES: &[usize] =
+    &[1, 2, 8, 16, 64, 512, 4 * 1024, 16 * 1024, 64 * 1024, 256 * 1024, 1024 * 1024];
+
+/// Returns a generator of `(sorted haystack, needle)` pairs of size `len`.
+///
+/// The RNG is seeded with `len`, so the produced sequence is reproducible.
+fn gen_inputs(len: usize) -> impl FnMut() -> (Vec<u64>, u64) {
+    let mut rng = Rng::with_seed(len as u64);
+
+    move || {
+        // A `BTreeSet` yields sorted, duplicate-free values; keep drawing
+        // until it holds exactly `len` of them.
+        let mut unique = BTreeSet::new();
+        while unique.len() < len {
+            unique.insert(rng.u64(..));
+        }
+        let haystack: Vec<u64> = unique.into_iter().collect();
+
+        // Coin flip: search for a present value or for an absent one.
+        let needle = if rng.bool() {
+            *rng.choice(&haystack).unwrap()
+        } else {
+            // Rejection-sample until the value is not in the haystack.
+            let mut candidate = rng.u64(..);
+            while haystack.contains(&candidate) {
+                candidate = rng.u64(..);
+            }
+            candidate
+        };
+
+        assert_eq!(haystack.len(), len);
+        (haystack, needle)
+    }
+}
+
+#[divan::bench(args = SIZES, max_time = 1)]
+fn linear(bencher: Bencher, len: usize) {
+    bencher
+        .with_inputs(gen_inputs(len))
+        .bench_local_refs(|(haystack, needle)| haystack.iter().find(|v| **v == *needle).copied())
+}
+
+#[divan::bench(args = SIZES, max_time = 1)]
+fn binary(bencher: Bencher, len: usize) {
+    bencher
+        .with_inputs(gen_inputs(len))
+        .bench_local_refs(|(haystack, needle)| haystack.binary_search_by(|v| v.cmp(needle)))
+}
+
+#[divan::bench(args = SIZES, max_time = 1)]
+fn btree_set(bencher: Bencher, len: usize) {
+    let mut next_input = gen_inputs(len);
+    // Re-collect the `Vec` haystack into a `BTreeSet` for every input.
+    let inputs = || -> (BTreeSet<u64>, u64) {
+        let (values, needle) = next_input();
+        (values.into_iter().collect(), needle)
+    };
+    let with_inputs = bencher.with_inputs(inputs);
+    with_inputs.bench_local_refs(|(haystack, needle)| haystack.get(needle).copied())
+}
+
+/// Local implementation instead of `BuildHasherDefault` to get shorter name in
+/// output.
+#[derive(Default)]
+struct WyHash;
+
+impl BuildHasher for WyHash {
+    type Hasher = wyhash::WyHash;
+
+    fn build_hasher(&self) -> Self::Hasher {
+        wyhash::WyHash::default()
+    }
+}
+
+/// Looks up one value in a `HashSet` built with the hasher `H`.
+#[divan::bench(args = SIZES, max_time = 1, types = [RandomState, WyHash])]
+fn hash_set<H>(bencher: Bencher, len: usize)
+where
+    H: BuildHasher + Default,
+{
+    let mut next_input = gen_inputs(len);
+
+    // Re-collect the `Vec` haystack into a `HashSet` for every input.
+    let inputs = || -> (HashSet<u64, H>, u64) {
+        let (values, needle) = next_input();
+        (values.into_iter().collect(), needle)
+    };
+
+    // `get` yields `Option<&u64>`; `copied` turns it into an owned `Option<u64>`.
+    let with_inputs = bencher.with_inputs(inputs);
+    with_inputs.bench_local_refs(|(haystack, needle)| haystack.get(needle).copied())
+}
+
+#[divan::bench(args = SIZES, max_time = 1)]
+fn ordsearch(bencher: Bencher, len: usize) {
+    let mut next_input = gen_inputs(len);
+    // The sorted haystack from `gen_inputs` feeds `from_sorted_iter` directly.
+    let inputs = || {
+        let (values, needle) = next_input();
+        (OrderedCollection::from_sorted_iter(values), needle)
+    };
+    let with_inputs = bencher.with_inputs(inputs);
+    with_inputs.bench_local_refs(|(haystack, needle)| black_box_drop(haystack.find_gte(*needle)))
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/sort.rs b/crates/divan_compat/divan_fork/examples/benches/sort.rs
new file mode 100644
index 00000000..e1d799d4
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/sort.rs
@@ -0,0 +1,88 @@
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench sort
+//! ```
+
+use divan::{AllocProfiler, Bencher};
+use rayon::slice::ParallelSliceMut;
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    divan::main();
+}
+
+/// Deterministic input generators shared by the sorting benchmarks.
+mod gen {
+    pub const LEN: usize = 100_000;
+
+    pub fn rand_int_generator() -> impl FnMut() -> i32 {
+        let mut rng = fastrand::Rng::with_seed(42);
+        move || rng.i32(..)
+    }
+
+    pub fn rand_int_vec_generator() -> impl FnMut() -> Vec<i32> {
+        let mut next_int = rand_int_generator();
+        move || std::iter::repeat_with(&mut next_int).take(LEN).collect()
+    }
+
+    pub fn sorted_int_vec_generator() -> impl FnMut() -> Vec<i32> {
+        || (0..LEN).map(|i| i as i32).collect()
+    }
+}
+
+mod random {
+    use super::*;
+
+    #[divan::bench]
+    fn sort(bencher: Bencher) {
+        bencher.with_inputs(gen::rand_int_vec_generator()).bench_local_refs(|v| v.sort());
+    }
+
+    #[divan::bench]
+    fn sort_unstable(bencher: Bencher) {
+        bencher.with_inputs(gen::rand_int_vec_generator()).bench_local_refs(|v| v.sort_unstable());
+    }
+
+    #[divan::bench]
+    fn par_sort(bencher: Bencher) {
+        bencher.with_inputs(gen::rand_int_vec_generator()).bench_local_refs(|v| v.par_sort());
+    }
+
+    #[divan::bench]
+    fn par_sort_unstable(bencher: Bencher) {
+        bencher
+            .with_inputs(gen::rand_int_vec_generator())
+            .bench_local_refs(|v| v.par_sort_unstable());
+    }
+}
+
+mod sorted {
+    use super::*;
+
+    #[divan::bench]
+    fn sort(bencher: Bencher) {
+        bencher.with_inputs(gen::sorted_int_vec_generator()).bench_local_refs(|v| v.sort());
+    }
+
+    #[divan::bench]
+    fn sort_unstable(bencher: Bencher) {
+        bencher
+            .with_inputs(gen::sorted_int_vec_generator())
+            .bench_local_refs(|v| v.sort_unstable());
+    }
+
+    #[divan::bench]
+    fn par_sort(bencher: Bencher) {
+        bencher.with_inputs(gen::sorted_int_vec_generator()).bench_local_refs(|v| v.par_sort());
+    }
+
+    #[divan::bench]
+    fn par_sort_unstable(bencher: Bencher) {
+        bencher
+            .with_inputs(gen::sorted_int_vec_generator())
+            .bench_local_refs(|v| v.par_sort_unstable());
+    }
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/string.rs b/crates/divan_compat/divan_fork/examples/benches/string.rs
new file mode 100644
index 00000000..19956d2b
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/string.rs
@@ -0,0 +1,180 @@
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench string
+//! ```
+
+use divan::{
+    black_box, black_box_drop,
+    counter::{BytesCount, CharsCount},
+    AllocProfiler, Bencher,
+};
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    divan::main();
+}
+
+const LENS: &[usize] = &[0, 8, 64, 1024];
+
+#[derive(Default)]
+struct Ascii {
+    rng: fastrand::Rng,
+}
+
+#[derive(Default)]
+struct Unicode {
+    rng: fastrand::Rng,
+}
+
+trait GenString: Default {
+    fn gen_string(&mut self, char_count: usize) -> String;
+}
+
+impl GenString for Ascii {
+    fn gen_string(&mut self, char_count: usize) -> String {
+        (0..char_count).map(|_| self.rng.alphanumeric()).collect()
+    }
+}
+
+impl GenString for Unicode {
+    fn gen_string(&mut self, char_count: usize) -> String {
+        (0..char_count).map(|_| self.rng.char(..)).collect()
+    }
+}
+
+/// Benchmarks `String::clear` on strings of `len` chars produced by `G`.
+///
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS, max_time = 1)]
+fn clear<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_refs(String::clear);
+}
+
+/// Benchmarks dropping strings of `len` chars produced by `G`.
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS)]
+fn drop<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_values(std::mem::drop);
+}
+
+/// Benchmarks `std::str::from_utf8` on the bytes of strings of `len` chars.
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS)]
+fn validate_utf8<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_refs(|s| {
+            let bytes = black_box(s.as_bytes());
+            black_box_drop(std::str::from_utf8(bytes));
+        });
+}
+
+/// Benchmarks `chars().count()` on strings of `len` chars produced by `G`.
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS)]
+fn char_count<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_refs(|s| s.chars().count());
+}
+
+/// Benchmarks in-place `make_ascii_lowercase` on strings of `len` chars.
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS)]
+fn make_ascii_lowercase<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_refs(|s| s.make_ascii_lowercase());
+}
+
+/// Benchmarks in-place `make_ascii_uppercase` on strings of `len` chars.
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS)]
+fn make_ascii_uppercase<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_refs(|s| s.make_ascii_uppercase());
+}
+
+/// Benchmarks the allocating `to_ascii_lowercase` on strings of `len` chars.
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS)]
+fn to_ascii_lowercase<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_refs(|s| s.to_ascii_lowercase());
+}
+
+/// Benchmarks the allocating `to_ascii_uppercase` on strings of `len` chars.
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS)]
+fn to_ascii_uppercase<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_refs(|s| s.to_ascii_uppercase());
+}
+
+/// Benchmarks `to_lowercase` on strings of `len` chars produced by `G`.
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS)]
+fn to_lowercase<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_refs(|s| s.to_lowercase());
+}
+
+/// Benchmarks `to_uppercase` on strings of `len` chars produced by `G`.
+/// Counters: `len` as chars, plus each input's size in bytes.
+#[divan::bench(types = [Ascii, Unicode], args = LENS)]
+fn to_uppercase<G: GenString>(bencher: Bencher, len: usize) {
+    // Not named `gen`: that is a reserved keyword from the 2024 edition on.
+    let mut generator = G::default();
+    bencher
+        .counter(CharsCount::new(len))
+        .with_inputs(|| generator.gen_string(len))
+        .input_counter(BytesCount::of_str)
+        .bench_local_refs(|s| s.to_uppercase());
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/threads.rs b/crates/divan_compat/divan_fork/examples/benches/threads.rs
new file mode 100644
index 00000000..cf41a0ba
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/threads.rs
@@ -0,0 +1,390 @@
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench threads
+//! ```
+
+use std::{
+    cell::UnsafeCell,
+    sync::{
+        atomic::{AtomicUsize, Ordering::Relaxed},
+        Arc, Mutex, RwLock,
+    },
+    thread::{Thread, ThreadId},
+};
+
+use divan::{black_box, black_box_drop, AllocProfiler, Bencher};
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    divan::main();
+}
+
+// Available parallelism (0), baseline (1), and common CPU core counts.
+const THREADS: &[usize] = &[0, 1, 4, 16];
+
+#[divan::bench_group(threads = THREADS)]
+mod arc {
+    use super::*;
+
+    #[divan::bench]
+    fn clone(bencher: Bencher) {
+        let shared = Arc::new(42);
+        bencher.bench(|| Arc::clone(&shared));
+    }
+
+    #[divan::bench]
+    fn drop(bencher: Bencher) {
+        let shared = Arc::new(42);
+        bencher.with_inputs(|| Arc::clone(&shared)).bench_values(std::mem::drop);
+    }
+
+    #[divan::bench]
+    fn get_mut(bencher: Bencher) {
+        let shared = Arc::new(42);
+
+        bencher.with_inputs(|| Arc::clone(&shared)).bench_refs(|handle| {
+            // Black box the branched value to ensure a branch gets emitted.
+            // This more closely simulates `Arc::get_mut` usage in practice.
+            if let Some(val) = Arc::get_mut(handle) {
+                black_box_drop(val);
+            }
+        });
+    }
+}
+
+#[divan::bench_group(threads = THREADS)]
+mod mutex {
+    use super::*;
+
+    mod lock {
+        use super::*;
+
+        #[divan::bench]
+        fn block() {
+            static M: Mutex<u64> = Mutex::new(0);
+            black_box_drop(M.lock());
+        }
+
+        #[divan::bench]
+        fn r#try() {
+            static M: Mutex<u64> = Mutex::new(0);
+            black_box_drop(M.try_lock());
+        }
+    }
+
+    mod set {
+        use super::*;
+
+        #[divan::bench]
+        fn block() {
+            static M: Mutex<u64> = Mutex::new(0);
+            *black_box(M.lock().unwrap()) = black_box(42);
+        }
+
+        #[divan::bench]
+        fn r#try() {
+            static M: Mutex<u64> = Mutex::new(0);
+
+            if let Ok(lock) = M.try_lock() {
+                *black_box(lock) = black_box(42);
+            }
+        }
+    }
+}
+
+#[divan::bench_group(threads = THREADS)]
+mod rw_lock {
+    use super::*;
+
+    mod read {
+        use super::*;
+
+        #[divan::bench]
+        fn block() {
+            static L: RwLock<u64> = RwLock::new(0);
+            black_box_drop(L.read());
+        }
+
+        #[divan::bench]
+        fn r#try() {
+            static L: RwLock<u64> = RwLock::new(0);
+            black_box_drop(L.try_read());
+        }
+    }
+
+    mod write {
+        use super::*;
+
+        #[divan::bench]
+        fn block() {
+            static L: RwLock<u64> = RwLock::new(0);
+            black_box_drop(L.write());
+        }
+
+        #[divan::bench]
+        fn r#try() {
+            static L: RwLock<u64> = RwLock::new(0);
+            black_box_drop(L.try_write());
+        }
+    }
+
+    mod set {
+        use super::*;
+
+        #[divan::bench]
+        fn block() {
+            static L: RwLock<u64> = RwLock::new(0);
+            *black_box(L.write().unwrap()) = black_box(42);
+        }
+
+        #[divan::bench]
+        fn r#try() {
+            static L: RwLock<u64> = RwLock::new(0);
+
+            if let Ok(lock) = L.try_write() {
+                *black_box(lock) = black_box(42);
+            }
+        }
+    }
+}
+
+/// Benchmark getting an integer or pointer uniquely identifying the current
+/// thread or core.
+#[divan::bench_group(threads = THREADS)]
+mod thread_id {
+    use super::*;
+
+    #[divan::bench_group(name = "std")]
+    mod stdlib {
+        use super::*;
+
+        mod thread_local {
+            use super::*;
+
+            #[divan::bench]
+            fn count() -> usize {
+                static SHARED: AtomicUsize = AtomicUsize::new(0);
+
+                thread_local! {
+                    static LOCAL: usize = SHARED.fetch_add(1, Relaxed);
+                }
+
+                LOCAL.with(|count| *count)
+            }
+
+            #[divan::bench]
+            fn id() -> ThreadId {
+                thread_local! {
+                    static LOCAL: ThreadId = std::thread::current().id();
+                }
+
+                LOCAL.with(|id| *id)
+            }
+
+            #[divan::bench]
+            fn ptr() -> *mut u8 {
+                thread_local! {
+                    static LOCAL: UnsafeCell<u8> = const { UnsafeCell::new(0) };
+                }
+
+                LOCAL.with(|addr| addr.get())
+            }
+        }
+
+        mod thread {
+            use super::*;
+
+            #[divan::bench]
+            fn current() -> Thread {
+                std::thread::current()
+            }
+
+            #[divan::bench]
+            fn current_id() -> ThreadId {
+                std::thread::current().id()
+            }
+        }
+    }
+
+    #[cfg(unix)]
+    mod pthread {
+        use super::*;
+
+        // https://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_self.html
+        #[divan::bench(name = "self")]
+        fn this() -> libc::pthread_t {
+            unsafe { libc::pthread_self() }
+        }
+
+        #[divan::bench]
+        fn getspecific(bencher: Bencher) {
+            unsafe {
+                let mut key: libc::pthread_key_t = 0;
+                loop {
+                    match libc::pthread_key_create(&mut key, None) {
+                        0 => break,
+                        libc::EAGAIN => continue,
+                        error => panic!("{}", std::io::Error::from_raw_os_error(error)),
+                    }
+                }
+
+                bencher.bench(|| libc::pthread_getspecific(key));
+
+                libc::pthread_key_delete(key);
+            };
+        }
+
+        #[cfg(target_os = "macos")]
+        #[divan::bench]
+        fn get_stackaddr_np() -> *mut libc::c_void {
+            unsafe { libc::pthread_get_stackaddr_np(libc::pthread_self()) }
+        }
+
+        #[cfg(target_os = "macos")]
+        #[divan::bench]
+        fn threadid_np() -> u64 {
+            unsafe {
+                let mut tid = 0;
+                libc::pthread_threadid_np(libc::pthread_self(), &mut tid);
+                tid
+            }
+        }
+
+        #[cfg(target_os = "macos")]
+        #[divan::bench]
+        fn cpu_number_np() -> usize {
+            unsafe {
+                let mut cpu = 0;
+                libc::pthread_cpu_number_np(&mut cpu);
+                cpu
+            }
+        }
+    }
+
+    // https://www.gnu.org/software/hurd/gnumach-doc/Thread-Information.html
+    #[cfg(target_os = "macos")]
+    #[divan::bench]
+    fn mach_thread_self() -> impl Drop {
+        struct Thread(mach2::mach_types::thread_port_t);
+
+        impl Drop for Thread {
+            fn drop(&mut self) {
+                unsafe {
+                    mach2::mach_port::mach_port_deallocate(mach2::traps::mach_task_self(), self.0);
+                }
+            }
+        }
+
+        Thread(unsafe { mach2::mach_init::mach_thread_self() })
+    }
+
+    // https://man7.org/linux/man-pages/man2/gettid.2.html
+    #[cfg(target_os = "linux")]
+    #[divan::bench]
+    fn gettid() -> libc::pid_t {
+        unsafe { libc::gettid() }
+    }
+
+    // https://man7.org/linux/man-pages/man3/sched_getcpu.3.html
+    #[cfg(target_os = "linux")]
+    #[divan::bench]
+    fn sched_getcpu() -> libc::c_int {
+        unsafe { libc::sched_getcpu() }
+    }
+
+    #[cfg(windows)]
+    #[divan::bench]
+    #[allow(non_snake_case)]
+    fn GetCurrentProcessorNumber() -> u32 {
+        unsafe { winapi::um::processthreadsapi::GetCurrentProcessorNumber() }
+    }
+
+    #[cfg(windows)]
+    #[divan::bench]
+    #[allow(non_snake_case)]
+    fn GetCurrentProcessorNumberEx() -> (u16, u8) {
+        unsafe {
+            let mut result = std::mem::zeroed();
+            winapi::um::processthreadsapi::GetCurrentProcessorNumberEx(&mut result);
+            (result.Group, result.Number)
+        }
+    }
+
+    #[cfg(windows)]
+    #[divan::bench]
+    #[allow(non_snake_case)]
+    fn GetCurrentThread() -> std::os::windows::io::RawHandle {
+        unsafe { winapi::um::processthreadsapi::GetCurrentThread().cast() }
+    }
+
+    #[cfg(windows)]
+    #[divan::bench]
+    #[allow(non_snake_case)]
+    fn GetCurrentThreadId() -> u32 {
+        unsafe { winapi::um::processthreadsapi::GetCurrentThreadId() }
+    }
+
+    #[cfg(windows)]
+    #[divan::bench]
+    #[allow(non_snake_case)]
+    fn TlsGetValue(bencher: Bencher) {
+        unsafe {
+            use winapi::um::processthreadsapi::*;
+
+            let tls_index = TlsAlloc();
+            if tls_index == TLS_OUT_OF_INDEXES {
+                panic!("{}", std::io::Error::last_os_error());
+            }
+
+            bencher.bench(|| TlsGetValue(tls_index));
+
+            TlsFree(tls_index);
+        }
+    }
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "aarch64"),
+        any(target_os = "linux", target_os = "macos", target_os = "windows"),
+    ))]
+    #[divan::bench]
+    fn asm() -> usize {
+        unsafe {
+            let result: usize;
+
+            #[cfg(all(target_arch = "x86_64", any(target_os = "macos", target_os = "windows")))]
+            std::arch::asm!(
+                "mov {}, gs",
+                out(reg) result,
+                options(nostack, nomem, preserves_flags)
+            );
+
+            #[cfg(all(target_arch = "x86_64", target_os = "linux"))]
+            std::arch::asm!(
+                "mov {}, fs",
+                out(reg) result,
+                options(nostack, nomem, preserves_flags)
+            );
+
+            // https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/TPIDRRO-EL0--EL0-Read-Only-Software-Thread-ID-Register?lang=en
+            #[cfg(all(target_arch = "aarch64", any(target_os = "macos", target_os = "windows")))]
+            std::arch::asm!(
+                "mrs {}, tpidrro_el0",
+                out(reg) result,
+                options(nostack, nomem, preserves_flags)
+            );
+
+            // https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/TPIDR-EL0--EL0-Read-Write-Software-Thread-ID-Register?lang=en
+            #[cfg(all(target_arch = "aarch64", target_os = "linux"))]
+            std::arch::asm!(
+                "mrs {}, tpidr_el0",
+                out(reg) result,
+                options(nostack, nomem, preserves_flags)
+            );
+
+            result
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/examples/benches/time.rs b/crates/divan_compat/divan_fork/examples/benches/time.rs
new file mode 100644
index 00000000..dc595974
--- /dev/null
+++ b/crates/divan_compat/divan_fork/examples/benches/time.rs
@@ -0,0 +1,103 @@
+//! Run with:
+//!
+//! ```sh
+//! cargo bench -q -p examples --bench time
+//! ```
+
+use std::time::{Instant, SystemTime};
+
+use divan::{AllocProfiler, Bencher};
+
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    divan::main();
+}
+
+mod now {
+    use super::*;
+
+    #[divan::bench]
+    fn instant() -> Instant {
+        Instant::now()
+    }
+
+    #[divan::bench]
+    fn system_time() -> SystemTime {
+        SystemTime::now()
+    }
+
+    #[divan::bench(name = if cfg!(target_arch = "aarch64") {
+        "tsc (aarch64)"
+    } else {
+        "tsc (x86)"
+    })]
+    #[cfg(all(
+        not(miri),
+        any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"),
+    ))]
+    pub fn tsc() -> u64 {
+        #[cfg(target_arch = "aarch64")]
+        unsafe {
+            let timestamp: u64;
+            std::arch::asm!(
+                "mrs {}, cntvct_el0",
+                out(reg) timestamp,
+                // Leave off `nomem` because this should be a compiler fence.
+                options(nostack, preserves_flags),
+            );
+            timestamp
+        }
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        unsafe {
+            #[cfg(target_arch = "x86")]
+            use std::arch::x86;
+            #[cfg(target_arch = "x86_64")]
+            use std::arch::x86_64 as x86;
+
+            x86::_rdtsc()
+        }
+    }
+}
+
+mod duration_since {
+    use super::*;
+
+    #[divan::bench]
+    fn instant(bencher: Bencher) {
+        bencher
+            .with_inputs(|| [Instant::now(), Instant::now()])
+            .bench_values(|[start, end]| end.duration_since(start));
+    }
+
+    #[divan::bench]
+    fn system_time(bencher: Bencher) {
+        bencher
+            .with_inputs(|| [SystemTime::now(), SystemTime::now()])
+            .bench_values(|[start, end]| end.duration_since(start));
+    }
+
+    #[divan::bench(name = if cfg!(target_arch = "aarch64") {
+        "tsc (aarch64)"
+    } else {
+        "tsc (x86)"
+    })]
+    #[cfg(all(
+        not(miri),
+        any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"),
+    ))]
+    fn tsc(bencher: Bencher) {
+        bencher.with_inputs(|| [crate::now::tsc(), crate::now::tsc()]).bench_values(
+            |[start, end]| {
+                // Simply subtract because an optimized timing implementation
+                // would want to keep the value as TSC units for as long as
+                // possible before dividing by the TSC frequency.
+                //
+                // Use saturating arithmetic to ensure monotonicity (clamps to 0, never wraps).
+                end.saturating_sub(start)
+            },
+        )
+    }
+}
diff --git a/crates/divan_compat/divan_fork/internal_benches/Cargo.toml b/crates/divan_compat/divan_fork/internal_benches/Cargo.toml
new file mode 100644
index 00000000..a1463ceb
--- /dev/null
+++ b/crates/divan_compat/divan_fork/internal_benches/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "internal_benches"
+version = "0.0.0"
+edition = "2021"
+authors = ["Nikolai Vazquez"]
+license = "MIT OR Apache-2.0"
+description = "Internal benchmarks for Divan, a comfy benchmarking framework."
+readme = "../README.md"
+publish = false
+
+[dependencies]
+divan = { workspace = true, features = ["internal_benches"] }
+
+[[bench]]
+name = "internals"
+harness = false
diff --git a/crates/divan_compat/divan_fork/internal_benches/README.md b/crates/divan_compat/divan_fork/internal_benches/README.md
new file mode 100644
index 00000000..b4e4e9f7
--- /dev/null
+++ b/crates/divan_compat/divan_fork/internal_benches/README.md
@@ -0,0 +1,27 @@
+# Divan Internal Benchmarks
+
+This crate demonstrates how to use [Divan] to benchmark internals of a crate by
+benchmarking the internals of Divan.
+
+These can be benchmarked locally by running:
+
+```sh
+git clone https://github.com/nvzqz/divan.git
+cd divan
+
+cargo bench -q -p internal_benches
+```
+
+As of this writing, the output on my machine is:
+
+```txt
+divan                             fastest  │ slowest  │ median   │ mean     │ samples │ iters
+╰─ time                                    │          │          │          │         │
+   ╰─ timer                                │          │          │          │         │
+      ├─ get_tsc                  0.158 ns │ 0.202 ns │ 0.161 ns │ 0.162 ns │ 100     │ 1638400
+      ╰─ measure                           │          │          │          │         │
+         ├─ precision             89.58 µs │ 221.5 µs │ 201.9 µs │ 184.5 µs │ 100     │ 100
+         ╰─ sample_loop_overhead  314.2 µs │ 342.5 µs │ 314.5 µs │ 317.1 µs │ 100     │ 100
+```
+
+[divan]: https://github.com/nvzqz/divan
diff --git a/crates/divan_compat/divan_fork/internal_benches/benches/internals.rs b/crates/divan_compat/divan_fork/internal_benches/benches/internals.rs
new file mode 100644
index 00000000..37d0a1a7
--- /dev/null
+++ b/crates/divan_compat/divan_fork/internal_benches/benches/internals.rs
@@ -0,0 +1,8 @@
+use divan::AllocProfiler;
+
+#[global_allocator]
+static GLOBAL_ALLOC: AllocProfiler = AllocProfiler::system();
+
+fn main() {
+    divan::main();
+}
diff --git a/crates/divan_compat/divan_fork/macros/Cargo.toml b/crates/divan_compat/divan_fork/macros/Cargo.toml
new file mode 100644
index 00000000..87087a0e
--- /dev/null
+++ b/crates/divan_compat/divan_fork/macros/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "divan-macros"
+version = "0.1.17"
+edition = "2021"
+authors = ["Nikolai Vazquez"]
+license = "MIT OR Apache-2.0"
+description = "Macros for Divan, a statistically-comfy benchmarking library."
+repository = "https://github.com/nvzqz/divan"
+homepage = "https://github.com/nvzqz/divan"
+documentation = "https://docs.rs/divan-macros"
+categories = ["development-tools::profiling"]
+keywords = ["benchmark", "criterion", "instrument", "measure", "performance"]
+readme = "../README.md"
+
+[lib]
+proc-macro = true
+
+[dependencies]
+proc-macro2 = "1"
+quote = { version = "1", default-features = false }
+# Versions prior to *.18 fail to parse empty attribute metadata.
+syn = { version = "^2.0.18", default-features = false, features = ["full", "clone-impls", "parsing", "printing", "proc-macro"] }
+
+[dev-dependencies]
+divan = { workspace = true }
diff --git a/crates/divan_compat/divan_fork/macros/LICENSE-APACHE b/crates/divan_compat/divan_fork/macros/LICENSE-APACHE
new file mode 100644
index 00000000..d6456956
--- /dev/null
+++ b/crates/divan_compat/divan_fork/macros/LICENSE-APACHE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/crates/divan_compat/divan_fork/macros/LICENSE-MIT b/crates/divan_compat/divan_fork/macros/LICENSE-MIT
new file mode 100644
index 00000000..8faad18f
--- /dev/null
+++ b/crates/divan_compat/divan_fork/macros/LICENSE-MIT
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Nikolai Vazquez
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/crates/divan_compat/divan_fork/macros/src/attr_options.rs b/crates/divan_compat/divan_fork/macros/src/attr_options.rs
new file mode 100644
index 00000000..af2c6184
--- /dev/null
+++ b/crates/divan_compat/divan_fork/macros/src/attr_options.rs
@@ -0,0 +1,378 @@
+use proc_macro::TokenStream;
+use quote::{quote, ToTokens};
+use syn::{
+    parse::{Parse, Parser},
+    spanned::Spanned,
+    Expr, ExprArray, Ident, Token, Type,
+};
+
+use crate::{tokens, Macro};
+
+/// Values from parsed options shared between `#[divan::bench]` and
+/// `#[divan::bench_group]`.
+///
+/// The `crate` option is not included because it is only needed to get proper
+/// access to `__private`.
+pub(crate) struct AttrOptions {
+    /// `divan::__private`.
+    pub private_mod: proc_macro2::TokenStream,
+
+    /// Custom name for the benchmark or group.
+    pub name_expr: Option<Expr>,
+
+    /// `IntoIterator` from which to provide runtime arguments.
+    pub args_expr: Option<Expr>,
+
+    /// Options for generic functions.
+    pub generic: GenericOptions,
+
+    /// The `BenchOptions.counters` field and its value, followed by a comma.
+    pub counters: proc_macro2::TokenStream,
+
+    /// Options used directly as `BenchOptions` fields.
+    ///
+    /// Option reuse is handled by the compiler ensuring `BenchOptions` fields
+    /// are not repeated.
+    pub bench_options: Vec<(Ident, Expr)>,
+}
+
+impl AttrOptions {
+    pub fn parse(tokens: TokenStream, target_macro: Macro) -> Result<Self, TokenStream> {
+        let macro_name = target_macro.name();
+
+        let mut divan_crate = None::<syn::Path>;
+        let mut name_expr = None::<Expr>;
+        let mut args_expr = None::<Expr>;
+        let mut bench_options = Vec::new();
+
+        let mut counters = Vec::<(proc_macro2::TokenStream, Option<&str>)>::new();
+        let mut counters_ident = None::<Ident>;
+
+        let mut seen_bytes_count = false;
+        let mut seen_chars_count = false;
+        let mut seen_cycles_count = false;
+        let mut seen_items_count = false;
+
+        let mut generic = GenericOptions::default();
+
+        let attr_parser = syn::meta::parser(|meta| {
+            macro_rules! error {
+                ($($t:tt)+) => {
+                    return Err(meta.error(format_args!($($t)+)))
+                };
+            }
+
+            let Some(ident) = meta.path.get_ident() else {
+                error!("unsupported '{macro_name}' option");
+            };
+
+            let ident_name = ident.to_string();
+            let ident_name = ident_name.strip_prefix("r#").unwrap_or(&ident_name);
+
+            let repeat_error = || error!("repeated '{macro_name}' option '{ident_name}'");
+            let unsupported_error = || error!("unsupported '{macro_name}' option '{ident_name}'");
+
+            macro_rules! parse {
+                ($storage:expr) => {
+                    if $storage.is_none() {
+                        $storage = Some(meta.value()?.parse()?);
+                    } else {
+                        return repeat_error();
+                    }
+                };
+            }
+
+            match ident_name {
+                "crate" => parse!(divan_crate),
+                "name" => parse!(name_expr),
+                "types" => {
+                    match target_macro {
+                        Macro::Bench { fn_sig } => {
+                            if fn_sig.generics.type_params().next().is_none() {
+                                error!("generic type required for '{macro_name}' option '{ident_name}'");
+                            }
+                        }
+                        _ => return unsupported_error(),
+                    }
+
+                    parse!(generic.types);
+                }
+                "consts" => {
+                    match target_macro {
+                        Macro::Bench { fn_sig } => {
+                            if fn_sig.generics.const_params().next().is_none() {
+                                error!("generic const required for '{macro_name}' option '{ident_name}'");
+                            }
+                        }
+                        _ => return unsupported_error(),
+                    }
+
+                    parse!(generic.consts);
+                }
+                "args" => {
+                    match target_macro {
+                        Macro::Bench { fn_sig } => {
+                            if !matches!(fn_sig.inputs.len(), 1 | 2) {
+                                return Err(meta.error(format_args!("function argument required for '{macro_name}' option '{ident_name}'")));
+                            }
+                        }
+                        _ => return unsupported_error(),
+                    }
+
+                    parse!(args_expr);
+                }
+                "counter" => {
+                    if counters_ident.is_some() {
+                        return repeat_error();
+                    }
+                    let value: Expr = meta.value()?.parse()?;
+                    counters.push((value.into_token_stream(), None));
+                    counters_ident = Some(Ident::new("counters", ident.span()));
+                }
+                "counters" => {
+                    if counters_ident.is_some() {
+                        return repeat_error();
+                    }
+                    let values: ExprArray = meta.value()?.parse()?;
+                    counters.extend(
+                        values.elems.into_iter().map(|elem| (elem.into_token_stream(), None)),
+                    );
+                    counters_ident = Some(ident.clone());
+                }
+
+                "bytes_count" if seen_bytes_count => return repeat_error(),
+                "chars_count" if seen_chars_count => return repeat_error(),
+                "cycles_count" if seen_cycles_count => return repeat_error(),
+                "items_count" if seen_items_count => return repeat_error(),
+
+                "bytes_count" | "chars_count" | "cycles_count" | "items_count" => {
+                    let name = match ident_name {
+                        "bytes_count" => {
+                            seen_bytes_count = true;
+                            "BytesCount"
+                        }
+                        "chars_count" => {
+                            seen_chars_count = true;
+                            "CharsCount"
+                        }
+                        "cycles_count" => {
+                            seen_cycles_count = true;
+                            "CyclesCount"
+                        }
+                        "items_count" => {
+                            seen_items_count = true;
+                            "ItemsCount"
+                        }
+                        _ => unreachable!(),
+                    };
+
+                    let value: Expr = meta.value()?.parse()?;
+                    counters.push((value.into_token_stream(), Some(name)));
+                    counters_ident = Some(Ident::new("counters", proc_macro2::Span::call_site()));
+                }
+
+                _ => {
+                    let value: Expr = match meta.value() {
+                        Ok(value) => value.parse()?,
+
+                        // If the option is missing `=`, use a `true` literal.
+                        Err(_) => Expr::Lit(syn::ExprLit {
+                            lit: syn::LitBool::new(true, meta.path.span()).into(),
+                            attrs: Vec::new(),
+                        }),
+                    };
+
+                    bench_options.push((ident.clone(), value));
+                }
+            }
+
+            Ok(())
+        });
+
+        match attr_parser.parse(tokens) {
+            Ok(()) => {}
+            Err(error) => return Err(error.into_compile_error().into()),
+        }
+
+        let divan_crate = divan_crate.unwrap_or_else(|| syn::parse_quote!(::divan));
+        let private_mod = quote! { #divan_crate::__private };
+
+        let counters = counters.iter().map(|(expr, type_name)| match type_name {
+            Some(type_name) => {
+                let type_name = Ident::new(type_name, proc_macro2::Span::call_site());
+                quote! {
+                    // We do a scoped import for the expression to override any
+                    // local `From` trait.
+                    {
+                        use ::std::convert::From as _;
+
+                        #divan_crate::counter::#type_name::from(#expr)
+                    }
+                }
+            }
+            None => expr.to_token_stream(),
+        });
+
+        let counters = counters_ident
+            .map(|ident| {
+                quote! {
+                    #ident: #private_mod::new_counter_set() #(.with(#counters))* ,
+                }
+            })
+            .unwrap_or_default();
+
+        Ok(Self { private_mod, name_expr, args_expr, generic, counters, bench_options })
+    }
+
+    /// Produces a function expression for creating `LazyLock<BenchOptions>`.
+    ///
+    /// If the `#[ignore]` attribute is specified, this should be given its
+    /// identifier to set `BenchOptions` using its span. Doing this instead of
+    /// creating the `ignore` identifier ourselves improves compiler error
+    /// diagnostics.
+    pub fn bench_options_fn(
+        &self,
+        ignore_attr_ident: Option<&syn::Path>,
+    ) -> proc_macro2::TokenStream {
+        fn is_lit_array(expr: &Expr) -> bool {
+            let Expr::Array(expr) = expr else {
+                return false;
+            };
+            expr.elems.iter().all(|elem| matches!(elem, Expr::Lit { .. }))
+        }
+
+        let private_mod = &self.private_mod;
+        let option_some = tokens::option_some();
+
+        // Directly set fields on `BenchOptions`. This simplifies things by:
+        // - Having a single source of truth
+        // - Making unknown options a compile error
+        //
+        // We use `..` (struct update syntax) to ensure that no option is set
+        // twice, even if raw identifiers are used. This also has the accidental
+        // benefit of Rust Analyzer recognizing fields and emitting suggestions
+        // with docs and type info.
+        if self.bench_options.is_empty() && self.counters.is_empty() && ignore_attr_ident.is_none()
+        {
+            tokens::option_none()
+        } else {
+            let options_iter = self.bench_options.iter().map(|(option, value)| {
+                let option_name = option.to_string();
+                let option_name = option_name.strip_prefix("r#").unwrap_or(&option_name);
+
+                let wrapped_value: proc_macro2::TokenStream;
+                let value: &dyn ToTokens = match option_name {
+                    "threads" => {
+                        wrapped_value = if is_lit_array(value) {
+                            // If array of literals, just use `&[...]`.
+                            quote! { ::std::borrow::Cow::Borrowed(&#value) }
+                        } else {
+                            quote! { #private_mod::IntoThreads::into_threads(#value) }
+                        };
+
+                        &wrapped_value
+                    }
+
+                    // If the option is a `Duration`, use `IntoDuration` to be
+                    // polymorphic over `Duration` or `u64`/`f64` seconds.
+                    "min_time" | "max_time" => {
+                        wrapped_value =
+                            quote! { #private_mod::IntoDuration::into_duration(#value) };
+                        &wrapped_value
+                    }
+
+                    _ => value,
+                };
+
+                quote! { #option: #option_some(#value), }
+            });
+
+            let ignore = match ignore_attr_ident {
+                Some(ignore_attr_ident) => quote! { #ignore_attr_ident: #option_some(true), },
+                None => Default::default(),
+            };
+
+            let counters = &self.counters;
+
+            quote! {
+                #option_some(::std::sync::LazyLock::new(|| {
+                    #[allow(clippy::needless_update)]
+                    #private_mod::BenchOptions {
+                        #(#options_iter)*
+
+                        // Ignore comes after options so that options take
+                        // priority in compiler error diagnostics.
+                        #ignore
+
+                        #counters
+
+                        ..::std::default::Default::default()
+                    }
+                }))
+            }
+        }
+    }
+}
+
+/// Options for generic functions.
+#[derive(Default)]
+pub struct GenericOptions {
+    /// Generic types over which to instantiate benchmark functions.
+    pub types: Option<GenericTypes>,
+
+    /// `const` array/slice over which to instantiate benchmark functions.
+    pub consts: Option<Expr>,
+}
+
+impl GenericOptions {
+    /// Returns `true` if set exclusively to either:
+    /// - `types = []`
+    /// - `consts = []`
+    pub fn is_empty(&self) -> bool {
+        match (&self.types, &self.consts) {
+            (Some(types), None) => types.is_empty(),
+            (None, Some(Expr::Array(consts))) => consts.elems.is_empty(),
+            _ => false,
+        }
+    }
+
+    /// Returns an iterator of multiple `Some` for types, or a single `None` if
+    /// there are no types.
+    pub fn types_iter(&self) -> Box<dyn Iterator<Item = Option<&dyn ToTokens>> + '_> {
+        match &self.types {
+            None => Box::new(std::iter::once(None)),
+            Some(GenericTypes::List(types)) => {
+                Box::new(types.iter().map(|t| Some(t as &dyn ToTokens)))
+            }
+        }
+    }
+}
+
+/// Generic types over which to instantiate benchmark functions.
+pub enum GenericTypes {
+    /// List of types, e.g. `[i32, String, ()]`.
+    List(Vec<proc_macro2::TokenStream>),
+}
+
+impl Parse for GenericTypes {
+    fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> {
+        let content;
+        syn::bracketed!(content in input);
+
+        Ok(Self::List(
+            content
+                .parse_terminated(Type::parse, Token![,])?
+                .into_iter()
+                .map(|ty| ty.into_token_stream())
+                .collect(),
+        ))
+    }
+}
+
+impl GenericTypes {
+    pub fn is_empty(&self) -> bool {
+        match self {
+            Self::List(list) => list.is_empty(),
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/macros/src/lib.rs b/crates/divan_compat/divan_fork/macros/src/lib.rs
new file mode 100644
index 00000000..304775c4
--- /dev/null
+++ b/crates/divan_compat/divan_fork/macros/src/lib.rs
@@ -0,0 +1,610 @@
+//! Macros for [Divan](https://github.com/nvzqz/divan), a statistically-comfy
+//! benchmarking library brought to you by [Nikolai Vazquez](https://hachyderm.io/@nikolai).
+//!
+//! See [`divan`](https://docs.rs/divan) crate for documentation.
+
+use proc_macro::TokenStream;
+use quote::{quote, ToTokens};
+
+mod attr_options;
+mod tokens;
+
+use attr_options::*;
+use syn::{Expr, FnArg};
+
+#[derive(Clone, Copy)]
+enum Macro<'a> {
+    Bench { fn_sig: &'a syn::Signature }, // `bench`, with the annotated function's signature
+    BenchGroup, // `bench_group`
+}
+
+impl Macro<'_> {
+    fn name(&self) -> &'static str {
+        match *self {
+            Macro::BenchGroup => "bench_group", // `#[divan::bench_group]`
+            Macro::Bench { fn_sig: _ } => "bench", // `#[divan::bench]`
+        }
+    }
+}
+
+/// Comma-separated `target_os` predicates, grouped for use inside `#[cfg(any(...))]`.
+mod systems {
+    use super::*;
+
+    pub fn elf() -> proc_macro2::TokenStream {
+        quote! {
+            target_os = "android",
+            target_os = "dragonfly",
+            target_os = "freebsd",
+            target_os = "fuchsia",
+            target_os = "haiku",
+            target_os = "illumos",
+            target_os = "linux",
+            target_os = "netbsd",
+            target_os = "openbsd"
+        }
+    }
+
+    pub fn mach_o() -> proc_macro2::TokenStream {
+        quote! {
+            target_os = "ios",
+            target_os = "macos",
+            target_os = "tvos",
+            target_os = "watchos"
+        }
+    }
+}
+
+/// Attributes applied to a `static` that holds a pointer to a function, placing
+/// it in a per-platform link section so that the function runs before `main`.
+fn pre_main_attrs() -> proc_macro2::TokenStream {
+    let (elf, mach_o) = (systems::elf(), systems::mach_o());
+
+    // `#[used]` keeps the static in the output even though nothing names it.
+    quote! {
+        #[used]
+        #[cfg_attr(windows, link_section = ".CRT$XCU")]
+        #[cfg_attr(any(#elf), link_section = ".init_array")]
+        #[cfg_attr(any(#mach_o), link_section = "__DATA,__mod_init_func,mod_init_funcs")]
+    }
+}
+
+fn unsupported_error(attr_name: &str) -> proc_macro2::TokenStream {
+    let message = format!("Unsupported target OS for `#[divan::{attr_name}]`");
+
+    // Every OS family that `pre_main_attrs` assigns a link section for.
+    let (elf, mach_o) = (systems::elf(), systems::mach_o());
+
+    quote! {
+        #[cfg(not(any(windows, #elf, #mach_o)))]
+        ::std::compile_error!(#message);
+    }
+}
+
+#[proc_macro_attribute]
+pub fn bench(options: TokenStream, item: TokenStream) -> TokenStream {
+    let option_none = tokens::option_none();
+    let option_some = tokens::option_some();
+    // Parse a clone of `item`; the original tokens are re-emitted at the end.
+    let fn_item = item.clone();
+    let fn_item = syn::parse_macro_input!(fn_item as syn::ItemFn);
+    let fn_sig = &fn_item.sig;
+
+    let attr = Macro::Bench { fn_sig };
+    let attr_name = attr.name();
+
+    let options = match AttrOptions::parse(options, attr) {
+        Ok(options) => options,
+        Err(compile_error) => return compile_error,
+    };
+
+    // Items needed by generated code.
+    let AttrOptions { private_mod, .. } = &options;
+
+    let fn_ident = &fn_sig.ident;
+    let fn_name = fn_ident.to_string();
+    let fn_name_pretty = fn_name.strip_prefix("r#").unwrap_or(&fn_name);
+
+    // Find any `#[ignore]` attribute so that we can use its span to help
+    // compiler diagnostics.
+    let ignore_attr_ident =
+        fn_item.attrs.iter().map(|attr| attr.meta.path()).find(|path| path.is_ident("ignore"));
+
+    // If the function is `extern "ABI"`, it is wrapped in a Rust-ABI function.
+    let is_extern_abi = fn_sig.abi.is_some();
+
+    let fn_args = &fn_sig.inputs;
+    // First generic type parameter, with its index among all generic parameters.
+    let type_param: Option<(usize, &syn::TypeParam)> = fn_sig
+        .generics
+        .params
+        .iter()
+        .enumerate()
+        .filter_map(|(i, param)| match param {
+            syn::GenericParam::Type(param) => Some((i, param)),
+            _ => None,
+        })
+        .next();
+    // First generic const parameter, with its index among all generic parameters.
+    let const_param: Option<(usize, &syn::ConstParam)> = fn_sig
+        .generics
+        .params
+        .iter()
+        .enumerate()
+        .filter_map(|(i, param)| match param {
+            syn::GenericParam::Const(param) => Some((i, param)),
+            _ => None,
+        })
+        .next();
+    // Generic arguments are later emitted in the order the parameters are declared.
+    let is_type_before_const = match (type_param, const_param) {
+        (Some((t, _)), Some((c, _))) => t < c,
+        _ => false,
+    };
+
+    // Prefixed with "__" to prevent IDEs from recommending using this symbol.
+    //
+    // The static is local to intentionally cause a compile error if this
+    // attribute is used multiple times on the same function.
+    let static_ident = syn::Ident::new(
+        &format!("__DIVAN_BENCH_{}", fn_name_pretty.to_uppercase()),
+        fn_ident.span(),
+    );
+
+    let meta = entry_meta_expr(&fn_name, &options, ignore_attr_ident);
+
+    let bench_entry_runner = quote! { #private_mod::BenchEntryRunner };
+
+    // Creates a `__DIVAN_ARGS` global variable to be used in the entry.
+    let bench_args_global = if options.args_expr.is_some() {
+        quote! {
+            static __DIVAN_ARGS: #private_mod::BenchArgs = #private_mod::BenchArgs::new();
+        }
+    } else {
+        Default::default()
+    };
+
+    // The last argument type is used as the only `args` item type because we
+    // currently only support one runtime argument.
+    let last_arg_type = if options.args_expr.is_some() {
+        fn_args.last().map(|arg| match arg {
+            FnArg::Receiver(arg) => &*arg.ty,
+            FnArg::Typed(arg) => &*arg.ty,
+        })
+    } else {
+        None
+    };
+
+    let last_arg_type_tokens = last_arg_type
+        .map(|ty| match ty {
+            // Remove lifetime from references to not use the lifetime outside
+            // of its declaration. This allows benchmarks to take arguments with
+            // lifetimes.
+            syn::Type::Reference(ty) if ty.lifetime.is_some() => {
+                let mut ty = ty.clone();
+                ty.lifetime = None;
+                ty.to_token_stream()
+            }
+
+            _ => ty.to_token_stream(),
+        })
+        .unwrap_or_default();
+
+    // Some argument literals need an explicit type.
+    let arg_return_tokens = options
+        .args_expr
+        .as_ref()
+        .map(|args| match args {
+            // Empty array.
+            Expr::Array(args) if args.elems.is_empty() => quote! {
+                -> [#last_arg_type_tokens; 0]
+            },
+
+            _ => Default::default(),
+        })
+        .unwrap_or_default();
+
+    // Creates a function expr for the benchmarking function, optionally
+    // monomorphized with generic parameters.
+    let make_bench_fn = |generics: &[&dyn ToTokens]| {
+        let mut fn_expr = if generics.is_empty() {
+            // Use identifier as-is.
+            fn_ident.to_token_stream()
+        } else {
+            // Apply generic arguments.
+            quote! { #fn_ident::< #(#generics),* > }
+        };
+
+        // Handle function arguments.
+        match (fn_args.len(), &options.args_expr) {
+            // Simple benchmark with no arguments provided.
+            (0, None) => {
+                // Wrap in Rust ABI.
+                if is_extern_abi {
+                    fn_expr = quote! { || #fn_expr() };
+                }
+
+                quote! {
+                    #bench_entry_runner::Plain(|divan /* Bencher */| divan.bench(#fn_expr))
+                }
+            }
+
+            // `args` option used without function arguments; handled earlier in
+            // `AttrOptions::parse`.
+            (0, Some(_)) => unreachable!(),
+
+            // `Bencher` function argument.
+            (1, None) => {
+                // Wrap in Rust ABI.
+                if is_extern_abi {
+                    fn_expr = quote! { |divan /* Bencher */| #fn_expr(divan) };
+                }
+
+                quote! { #bench_entry_runner::Plain(#fn_expr) }
+            }
+
+            // Function argument comes from `args` option.
+            (1, Some(args)) => quote! {
+                #bench_entry_runner::Args(|| __DIVAN_ARGS.runner(
+                    || #arg_return_tokens { #args },
+
+                    |arg| #private_mod::ToStringHelper(arg).to_string(),
+
+                    |divan, __divan_arg| divan.bench(|| #fn_expr(
+                        #private_mod::Arg::<#last_arg_type_tokens>::get(__divan_arg)
+                    )),
+                ))
+            },
+
+            // `Bencher` and `args` option function arguments.
+            (2, Some(args)) => quote! {
+                #bench_entry_runner::Args(|| __DIVAN_ARGS.runner(
+                    || #arg_return_tokens { #args },
+
+                    |arg| #private_mod::ToStringHelper(arg).to_string(),
+
+                    |divan, __divan_arg| #fn_expr(
+                        divan,
+                        #private_mod::Arg::<#last_arg_type_tokens>::get(__divan_arg),
+                    ),
+                ))
+            },
+
+            // Ensure `args` is set if arguments are provided after `Bencher`.
+            (_, None) => quote! {
+                ::std::compile_error!(::std::concat!(
+                    "expected 'args' option containing '",
+                    ::std::stringify!(#last_arg_type_tokens),
+                    "'",
+                ))
+            },
+
+            // `args` option used with unsupported number of arguments; handled
+            // earlier in `AttrOptions::parse`.
+            (_, Some(_)) => unreachable!(),
+        }
+    };
+
+    let pre_main_attrs = pre_main_attrs();
+    let unsupported_error = unsupported_error(attr_name);
+
+    // Creates a `GroupEntry` static for generic benchmarks.
+    let make_generic_group = |generic_benches: proc_macro2::TokenStream| {
+        let entry = quote! {
+            #private_mod::GroupEntry {
+                meta: #meta,
+                generic_benches: #option_some({ #generic_benches }),
+            }
+        };
+
+        quote! {
+            #unsupported_error
+
+            // Push this static into `GROUP_ENTRIES` before `main` is called.
+            static #static_ident: #private_mod::GroupEntry = {
+                {
+                    // Add `push` to the initializer section.
+                    #pre_main_attrs
+                    static PUSH: extern "C" fn() = push;
+
+                    extern "C" fn push() {
+                        static NODE: #private_mod::EntryList<#private_mod::GroupEntry>
+                            = #private_mod::EntryList::new(&#static_ident);
+
+                        #private_mod::GROUP_ENTRIES.push(&NODE);
+                    }
+                }
+
+                // All generic entries share the same `BenchArgs` instance for
+                // efficiency and to ensure all entries use the same values, or
+                // at least the same names in the case of interior mutability.
+                #bench_args_global
+
+                #entry
+            };
+        }
+    };
+
+    // Creates a `GenericBenchEntry` expr for a generic benchmark instance.
+    let make_generic_bench_entry =
+        |ty: Option<&dyn ToTokens>, const_value: Option<&dyn ToTokens>| {
+            let generic_const_value = const_value.map(|const_value| quote!({ #const_value }));
+
+            let generics: Vec<&dyn ToTokens> = {
+                let mut generics = Vec::new();
+
+                generics.extend(generic_const_value.as_ref().map(|t| t as &dyn ToTokens));
+                generics.extend(ty);
+
+                if is_type_before_const {
+                    generics.reverse();
+                }
+
+                generics
+            };
+
+            let bench_fn = make_bench_fn(&generics);
+
+            let type_value = match ty {
+                Some(ty) => quote! {
+                    #option_some(#private_mod::EntryType::new::<#ty>())
+                },
+                None => option_none.clone(),
+            };
+
+            let const_value = match const_value {
+                Some(const_value) => quote! {
+                    #option_some(#private_mod::EntryConst::new(&#const_value))
+                },
+                None => option_none.clone(),
+            };
+
+            quote! {
+                #private_mod::GenericBenchEntry {
+                    group: &#static_ident,
+                    bench: #bench_fn,
+                    ty: #type_value,
+                    const_value: #const_value,
+                }
+            }
+        };
+    // Pick what to generate from the `consts` option first, then from `types`.
+    let generated_items: proc_macro2::TokenStream = match &options.generic.consts {
+        // Only specified `types = []` or `consts = []`; generate nothing.
+        _ if options.generic.is_empty() => Default::default(),
+
+        None => match &options.generic.types {
+            // No generics; generate a simple benchmark entry.
+            None => {
+                let bench_fn = make_bench_fn(&[]);
+
+                let entry = quote! {
+                    #private_mod::BenchEntry {
+                        meta: #meta,
+                        bench: #bench_fn,
+                    }
+                };
+
+                quote! {
+                    // Push this static into `BENCH_ENTRIES` before `main` is
+                    // called.
+                    static #static_ident: #private_mod::BenchEntry = {
+                        {
+                            // Add `push` to the initializer section.
+                            #pre_main_attrs
+                            static PUSH: extern "C" fn() = push;
+
+                            extern "C" fn push() {
+                                static NODE: #private_mod::EntryList<#private_mod::BenchEntry>
+                                    = #private_mod::EntryList::new(&#static_ident);
+
+                                #private_mod::BENCH_ENTRIES.push(&NODE);
+                            }
+                        }
+
+                        #bench_args_global
+
+                        #entry
+                    };
+                }
+            }
+
+            // Generate a benchmark group entry with generic benchmark entries.
+            Some(GenericTypes::List(generic_types)) => {
+                let generic_benches =
+                    generic_types.iter().map(|ty| make_generic_bench_entry(Some(&ty), None));
+
+                make_generic_group(quote! {
+                    &[&[#(#generic_benches),*]]
+                })
+            }
+        },
+
+        // Generate a benchmark group entry with generic benchmark entries.
+        Some(Expr::Array(generic_consts)) => {
+            let consts_count = generic_consts.elems.len();
+            let const_type = &const_param.unwrap().1.ty;
+
+            let generic_benches = options.generic.types_iter().map(|ty| {
+                let generic_benches = (0..consts_count).map(move |i| {
+                    let const_value = quote! { __DIVAN_CONSTS[#i] };
+                    make_generic_bench_entry(ty, Some(&const_value))
+                });
+
+                // `static` is necessary because `EntryConst` uses interior
+                // mutability to cache the `ToString` result.
+                quote! {
+                    static __DIVAN_GENERIC_BENCHES: [#private_mod::GenericBenchEntry; #consts_count] = [#(#generic_benches),*];
+                    &__DIVAN_GENERIC_BENCHES
+                }
+            });
+
+            make_generic_group(quote! {
+                // We refer to our own slice because it:
+                // - Type-checks values, even if `generic_benches` is empty
+                //   because the user set `types = []`
+                // - Prevents re-computing constants, which can slightly improve
+                //   compile time given that Miri is slow
+                const __DIVAN_CONSTS: &[#const_type] = &#generic_consts;
+
+                &[#({ #generic_benches }),*]
+            })
+        }
+
+        // Generate a benchmark group entry with generic benchmark entries over
+        // an expression of constants.
+        //
+        // This is limited to a maximum of 20 because we need some constant to
+        // instantiate each function instance.
+        Some(generic_consts) => {
+            // The maximum number of elements for non-array expressions.
+            const MAX_EXTERN_COUNT: usize = 20;
+
+            let const_type = &const_param.unwrap().1.ty;
+
+            let generic_benches = options.generic.types_iter().map(|ty| {
+                let generic_benches = (0..MAX_EXTERN_COUNT).map(move |i| {
+                    let const_value = quote! {
+                        // Fallback to the first constant if out of bounds.
+                        __DIVAN_CONSTS[if #i < __DIVAN_CONST_COUNT { #i } else { 0 }]
+                    };
+                    make_generic_bench_entry(ty, Some(&const_value))
+                });
+
+                // `static` is necessary because `EntryConst` uses interior
+                // mutability to cache the `ToString` result.
+                quote! {
+                    static __DIVAN_GENERIC_BENCHES: [#private_mod::GenericBenchEntry; __DIVAN_CONST_COUNT]
+                        = match #private_mod::shrink_array([#(#generic_benches),*]) {
+                            Some(array) => array,
+                            _ => panic!("external 'consts' cannot contain more than 20 values"),
+                        };
+
+                    &__DIVAN_GENERIC_BENCHES
+                }
+            });
+
+            make_generic_group(quote! {
+                const __DIVAN_CONST_COUNT: usize = __DIVAN_CONSTS.len();
+                const __DIVAN_CONSTS: &[#const_type] = &#generic_consts;
+
+                &[#({ #generic_benches }),*]
+            })
+        }
+    };
+
+    // Append our generated code to the existing token stream.
+    let mut result = item;
+    result.extend(TokenStream::from(generated_items));
+    result
+}
+
+#[proc_macro_attribute]
+pub fn bench_group(options: TokenStream, item: TokenStream) -> TokenStream {
+    let attr = Macro::BenchGroup;
+    let attr_name = attr.name();
+
+    let options = match AttrOptions::parse(options, attr) {
+        Ok(options) => options,
+        Err(compile_error) => return compile_error,
+    };
+
+    // Items needed by generated code.
+    let AttrOptions { private_mod, .. } = &options;
+
+    let option_none = tokens::option_none();
+
+    // TODO: Make module parsing cheaper by parsing only the necessary parts.
+    let mod_item = item.clone();
+    let mod_item = syn::parse_macro_input!(mod_item as syn::ItemMod);
+
+    let mod_ident = &mod_item.ident;
+    let mod_name = mod_ident.to_string();
+    let mod_name_pretty = mod_name.strip_prefix("r#").unwrap_or(&mod_name);
+
+    // Find any `#[ignore]` attribute so that we can use its span to help
+    // compiler diagnostics.
+    //
+    // TODO: Fix `unused_attributes` warning when using `#[ignore]` on a module.
+    let ignore_attr_ident =
+        mod_item.attrs.iter().map(|attr| attr.meta.path()).find(|path| path.is_ident("ignore"));
+
+    // Prefixed with "__" to prevent IDEs from recommending using this symbol.
+    //
+    // By having the static be local, we cause a compile error if this attribute
+    // is used multiple times on the same module.
+    let static_ident = syn::Ident::new(
+        &format!("__DIVAN_GROUP_{}", mod_name_pretty.to_uppercase()),
+        mod_ident.span(),
+    );
+
+    let meta = entry_meta_expr(&mod_name, &options, ignore_attr_ident);
+
+    let pre_main_attrs = pre_main_attrs();
+    let unsupported_error = unsupported_error(attr_name);
+
+    let generated_items = quote! {
+        #unsupported_error
+
+        // Push this static into `GROUP_ENTRIES` before `main` is called.
+        static #static_ident: #private_mod::EntryList<#private_mod::GroupEntry> = {
+            {
+                // Add `push` to the initializer section.
+                #pre_main_attrs
+                static PUSH: extern "C" fn() = push;
+
+                extern "C" fn push() {
+                    #private_mod::GROUP_ENTRIES.push(&#static_ident);
+                }
+            }
+
+            #private_mod::EntryList::new({
+                static #static_ident: #private_mod::GroupEntry = #private_mod::GroupEntry {
+                    meta: #meta,
+                    generic_benches: #option_none,
+                };
+
+                &#static_ident
+            })
+        };
+    };
+
+    // Append our generated code to the existing token stream.
+    let mut result = item;
+    result.extend(TokenStream::from(generated_items));
+    result
+}
+
+/// Builds the tokens of an `EntryMeta` struct expression for an entry.
+fn entry_meta_expr(
+    raw_name: &str,
+    options: &AttrOptions,
+    ignore_attr_ident: Option<&syn::Path>,
+) -> proc_macro2::TokenStream {
+    let private_mod = &options.private_mod;
+    let bench_options = options.bench_options_fn(ignore_attr_ident);
+
+    // Shown in place of the raw name unless a name expression was provided.
+    let stripped_name = raw_name.strip_prefix("r#").unwrap_or(raw_name);
+    let display_name: &dyn ToTokens = if let Some(name_expr) = &options.name_expr {
+        name_expr
+    } else {
+        &stripped_name
+    };
+
+    quote! {
+        #private_mod::EntryMeta {
+            raw_name: #raw_name,
+            display_name: #display_name,
+            bench_options: #bench_options,
+            module_path: ::std::module_path!(),
+
+            // `Span` location info is nightly-only, so use macros.
+            location: #private_mod::EntryLocation {
+                file: ::std::file!(),
+                line: ::std::line!(),
+                col: ::std::column!(),
+            },
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/macros/src/tokens.rs b/crates/divan_compat/divan_fork/macros/src/tokens.rs
new file mode 100644
index 00000000..71a52b53
--- /dev/null
+++ b/crates/divan_compat/divan_fork/macros/src/tokens.rs
@@ -0,0 +1,15 @@
+//! Token generation utilities.
+//!
+//! These use items from the standard library as `::std`. This works unless
+//! users do `extern crate x as std`, which is extremely unlikely.
+
+use proc_macro2::TokenStream;
+use quote::quote;
+
+pub fn option_some() -> TokenStream {
+    quote! { ::std::option::Option::Some }
+}
+
+pub fn option_none() -> TokenStream {
+    quote! { ::std::option::Option::None }
+}
diff --git a/crates/divan_compat/divan_fork/rustfmt.toml b/crates/divan_compat/divan_fork/rustfmt.toml
new file mode 100644
index 00000000..706917c3
--- /dev/null
+++ b/crates/divan_compat/divan_fork/rustfmt.toml
@@ -0,0 +1,5 @@
+# Rust code formatting; see https://rust-lang.github.io/rustfmt
+edition = "2021"
+newline_style = "Unix"
+use_field_init_shorthand = true
+use_small_heuristics = "Max"
diff --git a/crates/divan_compat/divan_fork/src/alloc.rs b/crates/divan_compat/divan_fork/src/alloc.rs
new file mode 100644
index 00000000..a00cf4e0
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/alloc.rs
@@ -0,0 +1,644 @@
+use std::{alloc::*, fmt, ptr::NonNull};
+
+use cfg_if::cfg_if;
+
+use crate::{stats::StatsSet, util::sync::AtomicFlag};
+
+#[cfg(target_os = "macos")]
+use crate::util::{sync::CachePadded, thread::PThreadKey};
+
+#[cfg(not(target_os = "macos"))]
+use std::cell::UnsafeCell;
+
+/// The `AllocProfiler` used as the global allocator in crate-internal tests.
+///
+/// This enables us to test it for:
+/// - Undefined behavior with Miri
+/// - Correctness when tallying
+#[cfg(test)]
+#[global_allocator]
+static ALLOC: AllocProfiler = AllocProfiler::system();
+
+/// Whether to ignore allocation info set during the benchmark.
+pub(crate) static IGNORE_ALLOC: AtomicFlag = AtomicFlag::new(false);
+
+/// Measures [`GlobalAlloc`] memory usage.
+///
+/// # Examples
+///
+/// The default usage is to create a
+/// [`#[global_allocator]`](macro@global_allocator) that wraps the [`System`]
+/// allocator with [`AllocProfiler::system()`]:
+///
+/// ```
+/// use std::collections::*;
+/// use divan::AllocProfiler;
+///
+/// #[global_allocator]
+/// static ALLOC: AllocProfiler = AllocProfiler::system();
+///
+/// fn main() {
+///     divan::main();
+/// }
+///
+/// #[divan::bench(types = [
+///     Vec<i32>,
+///     LinkedList<i32>,
+///     HashSet<i32>,
+/// ])]
+/// fn from_iter<T>() -> T
+/// where
+///     T: FromIterator<i32>,
+/// {
+///     (0..100).collect()
+/// }
+///
+/// #[divan::bench(types = [
+///     Vec<i32>,
+///     LinkedList<i32>,
+///     HashSet<i32>,
+/// ])]
+/// fn drop<T>(bencher: divan::Bencher)
+/// where
+///     T: FromIterator<i32>,
+/// {
+///     bencher
+///         .with_inputs(|| (0..100).collect::<T>())
+///         .bench_values(std::mem::drop);
+/// }
+/// ```
+///
+/// Wrap other [`GlobalAlloc`] implementations like
+/// [`mimalloc`](https://docs.rs/mimalloc) with [`AllocProfiler::new()`]:
+///
+/// ```
+/// use divan::AllocProfiler;
+/// use mimalloc::MiMalloc;
+///
+/// # #[cfg(not(miri))]
+/// #[global_allocator]
+/// static ALLOC: AllocProfiler<MiMalloc> = AllocProfiler::new(MiMalloc);
+/// ```
+///
+/// See [`string`](https://github.com/nvzqz/divan/blob/main/examples/benches/string.rs)
+/// and [`collections`](https://github.com/nvzqz/divan/blob/main/examples/benches/collections.rs)
+/// benchmarks for more examples.
+///
+/// # Implementation
+///
+/// Collecting allocation information happens at any point during which Divan is
+/// also measuring the time. As a result, counting allocations affects timing.
+///
+/// To reduce Divan's footprint during benchmarking:
+/// - Allocation information is recorded in thread-local storage to prevent
+///   contention when benchmarks involve multiple threads, either through
+///   options like [`threads`](macro@crate::bench#threads) or internally
+///   spawning their own threads.
+/// - It does not check for overflow and assumes it will not happen. This is
+///   subject to change in the future.
+/// - Fast thread-local storage access is assembly-optimized on macOS.
+///
+/// Allocation information is the only data Divan records outside of timing, and
+/// thus it also has the only code that affects timing. Steps for recording
+/// alloc info:
+/// 1. Load the thread-local slot for allocation information.
+///
+///    On macOS, this is via the
+///    [`gs`](https://github.com/nvzqz/divan/blob/v0.1.6/src/util/sync.rs#L34)/[`tpidrro_el0`](https://github.com/nvzqz/divan/blob/v0.1.6/src/util/sync.rs#L47)
+///    registers for
+///    [`pthread_getspecific`](https://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_getspecific.html).
+///    Although this is not guaranteed as stable ABI, in practice many programs
+///    assume these registers store thread-local data. [`thread_local!`] is used
+///    on all other platforms.
+///
+/// 2. Increment allocation operation invocation count and bytes count
+///    (a.k.a. size).
+///
+/// Allocation information is recorded in thread-local storage to prevent
+/// slowdowns from synchronized sharing when using multiple threads, through
+/// options like [`threads`](macro@crate::bench#threads).
+///
+/// Note that allocations in threads not controlled by Divan are not currently
+/// counted.
+#[derive(Debug, Default)]
+pub struct AllocProfiler<Alloc = System> {
+    alloc: Alloc, // Wrapped allocator; each `GlobalAlloc` call is forwarded to it.
+}
+
+unsafe impl<A: GlobalAlloc> GlobalAlloc for AllocProfiler<A> {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        let size = layout.size();
+
+        // Count the allocation on this thread's tallies, when available.
+        if let Some(mut slot) = ThreadAllocInfo::try_current() {
+            // SAFETY: We have exclusive access.
+            unsafe { slot.as_mut() }.tally_alloc(size);
+        }
+
+        self.alloc.alloc(layout)
+    }
+
+    unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
+        let size = layout.size();
+
+        // Count the allocation on this thread's tallies, when available.
+        if let Some(mut slot) = ThreadAllocInfo::try_current() {
+            // SAFETY: We have exclusive access.
+            unsafe { slot.as_mut() }.tally_alloc(size);
+        }
+
+        self.alloc.alloc_zeroed(layout)
+    }
+
+    unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
+        let old_size = layout.size();
+
+        // Count the reallocation on this thread's tallies, when available.
+        if let Some(mut slot) = ThreadAllocInfo::try_current() {
+            // SAFETY: We have exclusive access.
+            unsafe { slot.as_mut() }.tally_realloc(old_size, new_size);
+        }
+
+        self.alloc.realloc(ptr, layout, new_size)
+    }
+
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        let size = layout.size();
+
+        // Count the deallocation on this thread's tallies, when available.
+        if let Some(mut slot) = ThreadAllocInfo::try_current() {
+            // SAFETY: We have exclusive access.
+            unsafe { slot.as_mut() }.tally_dealloc(size);
+        }
+
+        self.alloc.dealloc(ptr, layout)
+    }
+}
+
+impl AllocProfiler {
+    /// Creates a profiler that wraps the [`System`] allocator.
+    #[inline]
+    pub const fn system() -> Self {
+        Self { alloc: System }
+    }
+}
+
+impl<A> AllocProfiler<A> {
+    /// Creates a profiler that wraps `alloc`, typically a [`GlobalAlloc`].
+    #[inline]
+    pub const fn new(alloc: A) -> Self {
+        AllocProfiler { alloc }
+    }
+}
+
+/// Thread-local allocation information.
+#[derive(Clone, Default)]
+#[repr(C)]
+pub(crate) struct ThreadAllocInfo {
+    // NOTE: `tallies` should be ordered first (`#[repr(C)]` keeps field order)
+    // so that `tally_realloc` can directly index `&self` without an offset.
+    pub tallies: ThreadAllocTallyMap,
+
+    // NOTE: Max size and count are signed for convenience but can never be
+    // negative because they are initialized to 0.
+    //
+    // PERF: Grouping current/max fields together by count and size makes
+    // `tally_alloc` take the least time on M1 Mac.
+    pub current_count: ThreadAllocCountSigned,
+    pub max_count: ThreadAllocCountSigned,
+    pub current_size: ThreadAllocCountSigned,
+    pub max_size: ThreadAllocCountSigned,
+}
+
+#[cfg(not(target_os = "macos"))]
+thread_local! {
+    /// `ThreadAllocInfo` instance specific to the current thread.
+    ///
+    /// On macOS, we use `ALLOC_PTHREAD_KEY` instead.
+    static CURRENT_THREAD_INFO: UnsafeCell<ThreadAllocInfo> = const {
+        UnsafeCell::new(ThreadAllocInfo::new())
+    };
+}
+
+#[cfg(target_os = "macos")]
+static ALLOC_PTHREAD_KEY: CachePadded<PThreadKey<ThreadAllocInfo>> = CachePadded(PThreadKey::new());
+
+impl ThreadAllocInfo {
+    #[inline]
+    pub const fn new() -> Self { // Zeroed tallies; every counter starts at 0.
+        Self {
+            tallies: ThreadAllocTallyMap::new(),
+            max_count: 0,
+            current_count: 0,
+            max_size: 0,
+            current_size: 0,
+        }
+    }
+
+    /// Returns the current thread's allocation information, initializing it on
+    /// first access.
+    ///
+    /// Returns `None` if the thread is terminating and has thus deallocated its
+    /// local instance.
+    #[inline]
+    pub fn current() -> Option<NonNull<Self>> {
+        cfg_if! {
+            if #[cfg(target_os = "macos")] {
+                return Self::try_current().or_else(slow_impl); // allocate if not yet set
+            } else {
+                Self::try_current()
+            }
+        }
+
+        #[cfg(target_os = "macos")]
+        #[cold]
+        #[inline(never)]
+        fn slow_impl() -> Option<NonNull<ThreadAllocInfo>> { // macOS first-access path
+            unsafe {
+                let layout = Layout::new::<ThreadAllocInfo>();
+
+                let Some(info_alloc) = NonNull::new(unsafe { System.alloc_zeroed(layout) }) else {
+                    handle_alloc_error(layout);
+                };
+
+                let success = ALLOC_PTHREAD_KEY.0.set(info_alloc.as_ptr().cast(), |this| {
+                    System.dealloc(this.as_ptr().cast(), Layout::new::<ThreadAllocInfo>());
+                });
+
+                if !success { // the key was not set: free the instance and report none
+                    System.dealloc(info_alloc.as_ptr(), layout);
+                    return None;
+                }
+
+                // When using static thread local key, write directly because it
+                // is undefined behavior to call `pthread_setspecific` with a
+                // key that didn't originate from `pthread_key_create`.
+                #[cfg(all(not(miri), not(feature = "dyn_thread_local"), target_arch = "x86_64"))]
+                unsafe {
+                    crate::util::thread::fast::set_static_thread_local(info_alloc.as_ptr());
+                };
+
+                Some(info_alloc.cast())
+            }
+        }
+    }
+
+    /// Returns the current thread's allocation information if initialized.
+    ///
+    /// Returns `None` if the instance has not yet been allocated or the thread
+    /// is terminating and has thus deallocated its local instance.
+    #[inline]
+    pub fn try_current() -> Option<NonNull<Self>> {
+        cfg_if! {
+            if #[cfg(target_os = "macos")] {
+                // Fast path: static thread local.
+                #[cfg(all(
+                    not(miri),
+                    not(feature = "dyn_thread_local"),
+                    target_arch = "x86_64",
+                ))]
+                return NonNull::new(unsafe {
+                    crate::util::thread::fast::get_static_thread_local::<Self>().cast_mut()
+                });
+
+                #[allow(unreachable_code)] // unreachable only when the fast path is compiled in
+                ALLOC_PTHREAD_KEY.0.get()
+            } else {
+                CURRENT_THREAD_INFO.try_with(|info| unsafe {
+                    NonNull::new_unchecked(info.get())
+                }).ok()
+            }
+        }
+    }
+
+    /// Resets every tally and counter to zero by overwriting `self` with `Self::new()`.
+    pub fn clear(&mut self) {
+        *self = Self::new();
+    }
+
+    /// Tallies the total count and size of the allocation operation.
+    #[inline]
+    pub fn tally_alloc(&mut self, size: usize) {
+        self.tally_op(AllocOp::Alloc, size);
+
+        self.current_count += 1;
+        self.max_count = self.max_count.max(self.current_count);
+
+        self.current_size += size as ThreadAllocCountSigned;
+        self.max_size = self.max_size.max(self.current_size);
+    }
+
+    /// Tallies the total count and size of the deallocation operation.
+    #[inline]
+    pub fn tally_dealloc(&mut self, size: usize) {
+        self.tally_op(AllocOp::Dealloc, size);
+
+        self.current_count -= 1; // the `max_*` values are left untouched here
+        self.current_size -= size as ThreadAllocCountSigned;
+    }
+
+    /// Tallies a reallocation as a grow or shrink of `|new_size - old_size|`.
+    #[inline]
+    pub fn tally_realloc(&mut self, old_size: usize, new_size: usize) {
+        let (diff, is_shrink) = new_size.overflowing_sub(old_size); // overflows iff new < old
+        let diff = diff as isize; // reinterpret the wrapped difference as a signed delta
+        let abs_diff = diff.wrapping_abs() as usize;
+
+        self.tally_op(AllocOp::realloc(is_shrink), abs_diff);
+
+        // NOTE: Realloc does not change allocation count.
+        self.current_size += diff as ThreadAllocCountSigned;
+        self.max_size = self.max_size.max(self.current_size);
+    }
+
+    /// Adds one occurrence and `size` to the tally stored for `op`.
+    #[inline]
+    fn tally_op(&mut self, op: AllocOp, size: usize) {
+        let tally = self.tallies.get_mut(op);
+        tally.count += 1;
+        tally.size += size as ThreadAllocCount;
+    }
+}
+
+/// A `count`/`size` pair of allocation numbers being accumulated.
+///
+/// # Memory Layout
+///
+/// Aligning to 16 nudges the compiler to emit aligned SIMD operations.
+///
+/// Placing `count` first generates less code on AArch64.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
+#[repr(C, align(16))]
+pub(crate) struct AllocTally<Count> {
+    /// The number of times this operation was performed.
+    pub count: Count,
+
+    /// The amount of memory this operation changed.
+    pub size: Count,
+}
+
+pub(crate) type ThreadAllocCount = condtype::num::Usize64; // unsigned: tally counts/sizes
+pub(crate) type ThreadAllocCountSigned = condtype::num::Isize64; // signed: `current_size` deltas
+
+pub(crate) type ThreadAllocTally = AllocTally<ThreadAllocCount>; // one thread's tally
+
+pub(crate) type TotalAllocTally = AllocTally<u128>; // `u128` sums; see `add_to_total`
+
+impl AllocTally<StatsSet<f64>> {
+    pub fn is_zero(&self) -> bool { // true only when both `count` and `size` report zero
+        self.count.is_zero() && self.size.is_zero()
+    }
+}
+
+impl<C> AllocTally<C> {
+    #[inline]
+    pub fn as_array(&self) -> &[C; 2] {
+        // SAFETY: This is `#[repr(C)]` with exactly two fields of type `C`, so
+        // `count` followed by `size` can be viewed as a contiguous `[C; 2]`.
+        unsafe { &*(self as *const _ as *const _) }
+    }
+}
+
+/// Allocation number categories; the discriminant indexes `AllocOpMap::values`.
+///
+/// Note that grow/shrink are first to improve code generation for `realloc`.
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub(crate) enum AllocOp {
+    Grow,
+    Shrink,
+    Alloc,
+    Dealloc,
+}
+
+impl AllocOp {
+    pub const ALL: [Self; 4] = {
+        use AllocOp::*;
+
+        // Use same order as declared so that it can be indexed as-is.
+        [Grow, Shrink, Alloc, Dealloc]
+    };
+
+    #[inline]
+    pub fn realloc(shrink: bool) -> Self { // `true` => `Shrink`, `false` => `Grow`
+        // This generates the same code as `std::mem::transmute`.
+        if shrink {
+            Self::Shrink
+        } else {
+            Self::Grow
+        }
+    }
+
+    #[inline]
+    pub fn name(self) -> &'static str { // lowercase label; also the `AllocOpMap` debug key
+        match self {
+            Self::Grow => "grow",
+            Self::Shrink => "shrink",
+            Self::Alloc => "alloc",
+            Self::Dealloc => "dealloc",
+        }
+    }
+
+    #[inline]
+    pub fn prefix(self) -> &'static str { // same text as `name()` with a trailing colon
+        match self {
+            Self::Grow => "grow:",
+            Self::Shrink => "shrink:",
+            Self::Alloc => "alloc:",
+            Self::Dealloc => "dealloc:",
+        }
+    }
+}
+
+/// Values keyed by `AllocOp`.
+#[derive(Clone, Copy, Default, PartialEq, Eq)]
+pub(crate) struct AllocOpMap<T> {
+    pub values: [T; 4], // one slot per op, indexed by `op as usize` (`AllocOp::ALL` order)
+}
+
+pub(crate) type ThreadAllocTallyMap = AllocOpMap<ThreadAllocTally>; // per-op thread tallies
+
+pub(crate) type TotalAllocTallyMap = AllocOpMap<TotalAllocTally>; // per-op `u128` totals
+
+impl<T: fmt::Debug> fmt::Debug for AllocOpMap<T> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // map of `op.name()` => value
+        f.debug_map().entries(AllocOp::ALL.iter().map(|&op| (op.name(), self.get(op)))).finish()
+    }
+}
+
+impl ThreadAllocTallyMap {
+    #[inline]
+    pub const fn new() -> Self {
+        unsafe { std::mem::transmute([0u8; size_of::<Self>()]) } // built from all-zero bytes
+    }
+
+    /// Returns `true` if all tallies are 0.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.values.iter().all(|tally| tally.count == 0 && tally.size == 0)
+    }
+
+    pub fn add_to_total(&self, total: &mut TotalAllocTallyMap) { // `total[op] += self[op]`
+        for (i, value) in self.values.iter().enumerate() {
+            total.values[i].count += value.count as u128;
+            total.values[i].size += value.size as u128;
+        }
+    }
+}
+
+impl<T> AllocOpMap<T> {
+    #[cfg(test)]
+    pub fn from_fn<F>(f: F) -> Self
+    where
+        F: FnMut(AllocOp) -> T,
+    {
+        Self { values: AllocOp::ALL.map(f) } // calls `f` once per op, in `ALL` order
+    }
+
+    #[inline]
+    pub const fn get(&self, op: AllocOp) -> &T {
+        &self.values[op as usize] // the discriminant doubles as the array index
+    }
+
+    #[inline]
+    pub fn get_mut(&mut self, op: AllocOp) -> &mut T {
+        &mut self.values[op as usize] // the discriminant doubles as the array index
+    }
+}
+
+#[cfg(feature = "internal_benches")]
+mod benches { // benchmarks of the `tally_*` methods and of `current`/`try_current`
+    use super::*;
+
+    // We want the approach to scale well with thread count.
+    const THREADS: &[usize] = &[0, 1, 2, 4, 16];
+
+    #[crate::bench(crate = crate, threads = THREADS)]
+    fn tally_alloc(bencher: crate::Bencher) {
+        IGNORE_ALLOC.set(true);
+
+        // Using 0 simulates tallying without affecting benchmark reporting.
+        let size = crate::black_box(0);
+
+        bencher.bench(|| {
+            if let Some(mut info) = ThreadAllocInfo::try_current() {
+                // SAFETY: We have exclusive access.
+                let info = unsafe { info.as_mut() };
+
+                info.tally_alloc(size);
+            }
+        })
+    }
+
+    #[crate::bench(crate = crate, threads = THREADS)]
+    fn tally_dealloc(bencher: crate::Bencher) {
+        IGNORE_ALLOC.set(true);
+
+        // Using 0 simulates tallying without affecting benchmark reporting.
+        let size = crate::black_box(0);
+
+        bencher.bench(|| {
+            if let Some(mut info) = ThreadAllocInfo::try_current() {
+                // SAFETY: We have exclusive access.
+                let info = unsafe { info.as_mut() };
+
+                info.tally_dealloc(size);
+            }
+        })
+    }
+
+    #[crate::bench(crate = crate, threads = THREADS)]
+    fn tally_realloc(bencher: crate::Bencher) {
+        IGNORE_ALLOC.set(true);
+
+        // Using 0 for both sizes simulates tallying without affecting benchmark reporting.
+        let new_size = crate::black_box(0);
+        let old_size = crate::black_box(0);
+
+        bencher.bench(|| {
+            if let Some(mut info) = ThreadAllocInfo::try_current() {
+                // SAFETY: We have exclusive access.
+                let info = unsafe { info.as_mut() };
+
+                info.tally_realloc(old_size, new_size);
+            }
+        })
+    }
+
+    #[crate::bench_group(crate = crate, threads = THREADS)]
+    mod current {
+        use super::*;
+
+        #[crate::bench(crate = crate)]
+        fn init() -> Option<NonNull<ThreadAllocInfo>> {
+            ThreadAllocInfo::current()
+        }
+
+        #[crate::bench(crate = crate)]
+        fn r#try() -> Option<NonNull<ThreadAllocInfo>> {
+            ThreadAllocInfo::try_current()
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Tests that `AllocProfiler` is counting correctly.
+    #[test]
+    fn tally() {
+        // Initialize the thread's alloc info.
+        //
+        // SAFETY: This cannot be kept as a reference and is instead a raw
+        // pointer because a reference would cause undefined behavior when
+        // `AllocProfiler` attempts to update tallies.
+        let mut alloc_info = ThreadAllocInfo::current().unwrap();
+
+        // Resets the allocation tallies and returns the previous tallies.
+        let mut take_alloc_tallies = || std::mem::take(unsafe { &mut alloc_info.as_mut().tallies });
+
+        // Start fresh.
+        _ = take_alloc_tallies();
+
+        // Helper to create `ThreadAllocTallyMap` since each operation only
+        // changes `buf` by 1 `i32`.
+        let item_tally = ThreadAllocTally { count: 1, size: size_of::<i32>() as _ };
+        let make_tally_map = |op: AllocOp| {
+            ThreadAllocTallyMap::from_fn(|other_op| {
+                if other_op == op {
+                    item_tally
+                } else {
+                    Default::default()
+                }
+            })
+        };
+
+        // Test zero: creating an empty `Vec` must not tally anything.
+        let mut buf: Vec<i32> = Vec::new();
+        assert_eq!(take_alloc_tallies(), Default::default());
+
+        // Test allocation.
+        buf.reserve_exact(1);
+        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Alloc));
+
+        // Test grow.
+        buf.reserve_exact(2);
+        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Grow));
+
+        // Test shrink.
+        buf.shrink_to(1);
+        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Shrink));
+
+        // Test dealloc.
+        drop(buf);
+        assert_eq!(take_alloc_tallies(), make_tally_map(AllocOp::Dealloc));
+
+        // Test all of the above together: every op is tallied exactly once.
+        let mut buf: Vec<i32> = Vec::new();
+        buf.reserve_exact(1); // alloc
+        buf.reserve_exact(2); // grow
+        buf.shrink_to(1); // shrink
+        drop(buf); // dealloc
+        assert_eq!(take_alloc_tallies(), ThreadAllocTallyMap { values: [item_tally; 4] });
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/bench/args.rs b/crates/divan_compat/divan_fork/src/bench/args.rs
new file mode 100644
index 00000000..62beb207
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/bench/args.rs
@@ -0,0 +1,338 @@
+//! Types used to implement runtime argument support.
+
+use std::{
+    any::{Any, TypeId},
+    borrow::Cow,
+    mem, slice,
+    sync::OnceLock,
+};
+
+use crate::{util::ty::TypeCast, Bencher};
+
+/// Holds lazily-initialized runtime arguments to be passed into a benchmark.
+///
+/// `#[divan::bench]` stores this as a `__DIVAN_ARGS` global for each entry, and
+/// then at runtime it is initialized once by a closure that creates the usable
+/// `BenchArgsRunner`.
+pub struct BenchArgs {
+    args: OnceLock<ErasedArgsSlice>, // filled by the first `runner` call via `get_or_init`
+}
+
+/// The result of making `BenchArgs` runnable from instantiating the arguments
+/// list and providing a typed benchmarking implementation.
+#[derive(Clone, Copy)]
+pub struct BenchArgsRunner {
+    args: &'static ErasedArgsSlice, // borrowed from the `BenchArgs` that created this runner
+    bench: fn(Bencher, &ErasedArgsSlice, arg_index: usize), // set to `bench::<I::Item, B>`
+}
+
+/// Type-erased `&'static [T]` that also stores names of the arguments.
+struct ErasedArgsSlice {
+    /// The start of `&[T]`.
+    args: *const (),
+
+    /// The start of `&[&'static str]`.
+    names: *const &'static str,
+
+    /// The number of arguments, which is also the number of names.
+    len: usize,
+
+    /// The ID of `T`, compared by `typed_args` before `args` is cast back.
+    arg_type: TypeId,
+}
+
+// SAFETY: Raw pointers in `ErasedArgsSlice` are used in a thread-safe way, and
+// the argument type is required to be `Send + Sync` when initialized from the
+// iterator in `BenchArgs::runner`.
+unsafe impl Send for ErasedArgsSlice {}
+unsafe impl Sync for ErasedArgsSlice {}
+
+impl BenchArgs {
+    /// Creates an uninitialized instance.
+    pub const fn new() -> Self {
+        Self { args: OnceLock::new() }
+    }
+
+    /// Initializes `self` from `make_args` unless already initialized, then returns a
+    /// `BenchArgsRunner` that will execute the benchmarking closure.
+    pub fn runner<I, B>(
+        &'static self,
+        make_args: impl FnOnce() -> I,
+        arg_to_string: impl Fn(&I::Item) -> String,
+        _bench_impl: B, // only the type `B` is used here; see `bench`
+    ) -> BenchArgsRunner
+    where
+        I: IntoIterator,
+        I::Item: Any + Send + Sync,
+        B: FnOnce(Bencher, &I::Item) + Copy,
+    {
+        let args = self.args.get_or_init(|| {
+            let args_iter = make_args().into_iter();
+
+            // Reuse arguments for names if already a slice of strings.
+            //
+            // NOTE: We do this over `I::IntoIter` instead of `I` since it works
+            // for both slices and `slice::Iter`.
+            let args_strings: Option<&'static [&str]> =
+                args_iter.cast_ref::<slice::Iter<&str>>().map(|iter| iter.as_slice());
+
+            // Collect arguments into leaked slice.
+            //
+            // Leaking the collected `args` simplifies memory management, such
+            // as when reusing for `names`. We're leaking anyways since this is
+            // accessed via a global `OnceLock`.
+            //
+            // PERF: We could optimize this to reuse arguments when users
+            // provide slices. However, for slices its `Item` is a reference, so
+            // `slice::Iter<I::Item>` would never match here. To make this
+            // optimization, we would need to be able to get the referee type.
+            let args: &'static [I::Item] = Box::leak(args_iter.collect());
+
+            // Collect printable representations of arguments.
+            //
+            // PERF: We take multiple opportunities to reuse the provided
+            // arguments buffer or individual strings' buffers:
+            // - `&[&str]`
+            // - `IntoIterator<Item = &str>`
+            // - `IntoIterator<Item = String>`
+            // - `IntoIterator<Item = Box<str>>`
+            // - `IntoIterator<Item = Cow<str>>`
+            let names: &'static [&str] = 'names: {
+                // PERF: Reuse arguments strings slice.
+                if let Some(args) = args_strings {
+                    break 'names args;
+                }
+
+                // PERF: Reuse our args slice allocation.
+                if let Some(args) = args.cast_ref::<&[&str]>() {
+                    break 'names args;
+                }
+
+                Box::leak(
+                    args.iter()
+                        .map(|arg| -> &str {
+                            // PERF: Reuse strings as-is.
+                            if let Some(arg) = arg.cast_ref::<String>() {
+                                return arg;
+                            }
+                            if let Some(arg) = arg.cast_ref::<Box<str>>() {
+                                return arg;
+                            }
+                            if let Some(arg) = arg.cast_ref::<Cow<str>>() {
+                                return arg;
+                            }
+
+                            // Default to `arg_to_string`, which will format via
+                            // either `ToString` or `Debug`.
+                            Box::leak(arg_to_string(arg).into_boxed_str())
+                        })
+                        .collect(),
+                )
+            };
+
+            ErasedArgsSlice {
+                // We `black_box` arguments to prevent the compiler from
+                // optimizing the benchmark for the provided values.
+                args: crate::black_box(args.as_ptr().cast()),
+                names: names.as_ptr(),
+                len: args.len(),
+                arg_type: TypeId::of::<I::Item>(),
+            }
+        });
+
+        BenchArgsRunner { args, bench: bench::<I::Item, B> }
+    }
+}
+
+impl Default for BenchArgs {
+    fn default() -> Self { // same as `BenchArgs::new()`: an uninitialized instance
+        Self::new()
+    }
+}
+
+impl BenchArgsRunner {
+    #[inline]
+    pub(crate) fn bench(&self, bencher: Bencher, index: usize) { // `index` picks the argument
+        (self.bench)(bencher, self.args, index)
+    }
+
+    #[inline]
+    pub(crate) fn arg_names(&self) -> &'static [&'static str] { // same order as the arguments
+        self.args.names()
+    }
+}
+
+impl ErasedArgsSlice {
+    /// Retrieves a slice of arguments if the type is `T`, or `None` otherwise.
+    #[inline]
+    fn typed_args<T: Any>(&self) -> Option<&[T]> {
+        if self.arg_type == TypeId::of::<T>() {
+            // SAFETY: `T` was just checked, and `BenchArgs::runner` stores `len` instances.
+            Some(unsafe { slice::from_raw_parts(self.args.cast(), self.len) })
+        } else {
+            None
+        }
+    }
+
+    /// Returns the arguments' names.
+    ///
+    /// Names are in the same order as args and thus their indices can be used
+    /// to reference arguments.
+    #[inline]
+    fn names(&self) -> &'static [&str] {
+        // SAFETY: `BenchArgs::runner` guarantees storing `len` names.
+        unsafe { slice::from_raw_parts(self.names, self.len) }
+    }
+}
+
+/// The `BenchArgsRunner.bench` implementation; panics if `T` is not the stored type.
+fn bench<T, B>(bencher: Bencher, erased_args: &ErasedArgsSlice, arg_index: usize)
+where
+    T: Any,
+    B: FnOnce(Bencher, &T) + Copy,
+{
+    // We defer type checking until the benchmark is run to make safety of this
+    // function easier to audit. Checking here instead of in `BenchArgs::runner`
+    // is late but fine since this check will only fail due to a bug in Divan's
+    // macro code generation.
+
+    let Some(typed_args) = erased_args.typed_args::<T>() else {
+        type_mismatch::<T>();
+
+        // Reduce code size by using a separate function for each `T` instead of
+        // each benchmark closure.
+        #[cold]
+        #[inline(never)]
+        fn type_mismatch<T>() -> ! {
+            unreachable!("incorrect type '{}'", std::any::type_name::<T>())
+        }
+    };
+
+    // SAFETY: The closure is a ZST, so we can construct one out of thin air.
+    // This can be done multiple times without invoking a `Drop` destructor
+    // because it implements `Copy`.
+    let bench_impl: B = unsafe {
+        assert_eq!(size_of::<B>(), 0, "benchmark closure expected to be zero-sized");
+        mem::zeroed()
+    };
+
+    bench_impl(bencher, &typed_args[arg_index]); // indexing panics if `arg_index >= len`
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Test that optimizations for string items are applied.
+    mod optimizations {
+        use std::borrow::Borrow;
+
+        use super::*;
+
+        /// Tests that two slices contain the same exact strings, compared by
+        /// both value and data pointer.
+        fn test_eq_ptr<A: Borrow<str>, B: Borrow<str>>(a: &[A], b: &[B]) {
+            assert_eq!(a.len(), b.len());
+
+            for (a, b) in a.iter().zip(b) {
+                let a = a.borrow();
+                let b = b.borrow();
+                assert_eq!(a, b);
+                assert_eq!(a.as_ptr(), b.as_ptr());
+            }
+        }
+
+        /// Tests that `&[&str]` reuses the original slice for names.
+        #[test]
+        fn str_slice() {
+            static ARGS: BenchArgs = BenchArgs::new();
+            static ORIG_ARGS: &[&str] = &["a", "b"];
+
+            let runner = ARGS.runner(|| ORIG_ARGS, ToString::to_string, |_, _| {});
+
+            let typed_args: Vec<&str> =
+                runner.args.typed_args::<&&str>().unwrap().iter().copied().copied().collect();
+            let names = runner.arg_names();
+
+            // Test values.
+            assert_eq!(names, ORIG_ARGS);
+            assert_eq!(names, typed_args);
+
+            // Test addresses: names alias `ORIG_ARGS`, not the locally collected `Vec`.
+            assert_eq!(names.as_ptr(), ORIG_ARGS.as_ptr());
+            assert_ne!(names.as_ptr(), typed_args.as_ptr());
+        }
+
+        /// Tests optimizing `IntoIterator<Item = &str>` to reuse the same
+        /// allocation for also storing argument names.
+        #[test]
+        fn str_array() {
+            static ARGS: BenchArgs = BenchArgs::new();
+
+            let runner = ARGS.runner(|| ["a", "b"], ToString::to_string, |_, _| {});
+
+            let typed_args = runner.args.typed_args::<&str>().unwrap();
+            let names = runner.arg_names();
+
+            // Test values.
+            assert_eq!(names, ["a", "b"]);
+            assert_eq!(names, typed_args);
+
+            // Test addresses: names and args share one slice allocation.
+            assert_eq!(names.as_ptr(), typed_args.as_ptr());
+        }
+
+        /// Tests optimizing `IntoIterator<Item = String>` to reuse the same
+        /// allocation for also storing argument names.
+        #[test]
+        fn string_array() {
+            static ARGS: BenchArgs = BenchArgs::new();
+
+            let runner =
+                ARGS.runner(|| ["a".to_owned(), "b".to_owned()], ToString::to_string, |_, _| {});
+
+            let typed_args = runner.args.typed_args::<String>().unwrap();
+            let names = runner.arg_names();
+
+            assert_eq!(names, ["a", "b"]);
+            test_eq_ptr(names, typed_args);
+        }
+
+        /// Tests optimizing `IntoIterator<Item = Box<str>>` to reuse the same
+        /// allocation for also storing argument names.
+        #[test]
+        fn box_str_array() {
+            static ARGS: BenchArgs = BenchArgs::new();
+
+            let runner = ARGS.runner(
+                || ["a".to_owned().into_boxed_str(), "b".to_owned().into_boxed_str()],
+                ToString::to_string,
+                |_, _| {},
+            );
+
+            let typed_args = runner.args.typed_args::<Box<str>>().unwrap();
+            let names = runner.arg_names();
+
+            assert_eq!(names, ["a", "b"]);
+            test_eq_ptr(names, typed_args);
+        }
+
+        /// Tests optimizing `IntoIterator<Item = Cow<str>>` to reuse the same
+        /// allocation for also storing argument names.
+        #[test]
+        fn cow_str_array() {
+            static ARGS: BenchArgs = BenchArgs::new();
+
+            let runner = ARGS.runner(
+                || [Cow::Owned("a".to_owned()), Cow::Borrowed("b")],
+                ToString::to_string,
+                |_, _| {},
+            );
+
+            let typed_args = runner.args.typed_args::<Cow<str>>().unwrap();
+            let names = runner.arg_names();
+
+            assert_eq!(names, ["a", "b"]);
+            test_eq_ptr(names, typed_args);
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/bench/defer.rs b/crates/divan_compat/divan_fork/src/bench/defer.rs
new file mode 100644
index 00000000..67d12f67
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/bench/defer.rs
@@ -0,0 +1,188 @@
+use std::{
+    cell::UnsafeCell,
+    mem::{ManuallyDrop, MaybeUninit},
+};
+
+/// Defers input usage and output drop during benchmarking.
+///
+/// To reduce memory usage, this only allocates storage for inputs if outputs do
+/// not need deferred drop.
+pub(crate) union DeferStore<I, O> {
+    /// The variant used if outputs need to be dropped.
+    ///
+    /// Inputs are stored contiguously with outputs in memory. This
+    /// improves performance by:
+    /// - Removing the overhead of `zip` between two separate buffers.
+    /// - Improving cache locality and cache prefetching. Input is strategically
+    ///   placed before output because iteration is from low to high addresses,
+    ///   so doing this makes memory access patterns very predictable.
+    slots: ManuallyDrop<Vec<DeferSlot<I, O>>>,
+
+    /// The variant used if `Self::ONLY_INPUTS`, i.e. outputs do not need to be
+    /// dropped.
+    inputs: ManuallyDrop<Vec<DeferSlotItem<I>>>,
+}
+
+impl<I, O> Drop for DeferStore<I, O> {
+    #[inline]
+    fn drop(&mut self) {
+        // SAFETY: `ONLY_INPUTS` selects the same variant that `default` created.
+        unsafe {
+            if Self::ONLY_INPUTS {
+                ManuallyDrop::drop(&mut self.inputs)
+            } else {
+                ManuallyDrop::drop(&mut self.slots)
+            }
+        }
+    }
+}
+
+impl<I, O> Default for DeferStore<I, O> {
+    #[inline]
+    fn default() -> Self {
+        // SAFETY: Creates the variant that `ONLY_INPUTS` selects in every other method.
+        unsafe {
+            if Self::ONLY_INPUTS {
+                Self { inputs: ManuallyDrop::new(Vec::new()) }
+            } else {
+                Self { slots: ManuallyDrop::new(Vec::new()) }
+            }
+        }
+    }
+}
+
+impl<I, O> DeferStore<I, O> {
+    /// Whether only inputs need to be deferred.
+    ///
+    /// If `true`, outputs do not get inserted into `DeferStore`.
+    const ONLY_INPUTS: bool = !std::mem::needs_drop::<O>();
+
+    /// Prepares storage for iterating over `DeferSlot`s for a sample.
+    #[inline]
+    pub fn prepare(&mut self, sample_size: usize) {
+        // Common implementation regardless of `Vec` item type.
+        macro_rules! imp {
+            ($vec:expr) => {{
+                $vec.clear(); // length is 0 from here, so `reserve_exact` covers `sample_size`
+                $vec.reserve_exact(sample_size);
+
+                // SAFETY: `Vec` only contains `MaybeUninit` fields, so values
+                // may be safely created from uninitialized memory.
+                unsafe { $vec.set_len(sample_size) }
+            }};
+        }
+
+        // SAFETY: The correct variant is used based on `ONLY_INPUTS`.
+        unsafe {
+            if Self::ONLY_INPUTS {
+                imp!(self.inputs)
+            } else {
+                imp!(self.slots)
+            }
+        }
+    }
+
+    /// Returns the sample's slots for iteration.
+    ///
+    /// The caller is expected to use the returned slice to initialize inputs
+    /// for the sample loop.
+    ///
+    /// This returns `Err` containing only input slots if `O` does not need
+    /// deferred drop. Ideally this would be implemented directly on `DeferSlot`
+    /// but there's no way to change its size based on `needs_drop::<O>()`.
+    #[inline(always)]
+    pub fn slots(&self) -> Result<&[DeferSlot<I, O>], &[DeferSlotItem<I>]> {
+        unsafe { // SAFETY: The correct variant is used based on `ONLY_INPUTS`.
+            if Self::ONLY_INPUTS {
+                Err(&self.inputs)
+            } else {
+                Ok(&self.slots)
+            }
+        }
+    }
+}
+
+/// Storage for a single iteration within a sample.
+///
+/// Input is stored before output to improve cache prefetching since iteration
+/// progresses from low to high addresses.
+///
+/// # UnsafeCell
+///
+/// `UnsafeCell` is used to allow `output` to safely refer to `input`. Although
+/// `output` itself is never aliased, it is also stored as `UnsafeCell` in order
+/// to get mutable access through a shared `&DeferSlot`.
+///
+/// # Safety
+///
+/// All fields **must** be `MaybeUninit`. This allows us to safely set the
+/// length of `Vec<DeferSlot>` within the allocated capacity.
+#[repr(C)]
+pub(crate) struct DeferSlot<I, O> {
+    pub input: DeferSlotItem<I>,
+    pub output: DeferSlotItem<O>,
+}
+
+type DeferSlotItem<T> = UnsafeCell<MaybeUninit<T>>; // interior-mutable, possibly-uninit `T`
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Tests that accessing an uninitialized `DeferSlot` is safe due to all of
+    /// its fields being `MaybeUninit`.
+    #[test]
+    fn access_uninit_slot() {
+        let mut slot: MaybeUninit<DeferSlot<String, String>> = MaybeUninit::uninit();
+
+        let slot_ref = unsafe { slot.assume_init_mut() };
+        slot_ref.input = UnsafeCell::new(MaybeUninit::new(String::new()));
+        slot_ref.output = UnsafeCell::new(MaybeUninit::new(String::new()));
+
+        unsafe {
+            let slot = slot.assume_init();
+            assert_eq!(slot.input.into_inner().assume_init(), "");
+            assert_eq!(slot.output.into_inner().assume_init(), "");
+        }
+    }
+
+    /// Tests that accessing `DeferSlot.input` through an aliased reference in
+    /// `DeferSlot.output` is safe due to `input` being an `UnsafeCell`.
+    #[test]
+    fn access_aliased_input() {
+        struct Output<'i> {
+            input: &'i mut String,
+        }
+
+        impl Drop for Output<'_> {
+            fn drop(&mut self) {
+                assert_eq!(self.input, "hello");
+                self.input.push_str(" world");
+            }
+        }
+
+        let slot: MaybeUninit<DeferSlot<String, Output>> = MaybeUninit::uninit();
+        let slot_ref = unsafe { slot.assume_init_ref() };
+
+        // Loop to ensure previous iterations don't affect later uses of the
+        // same entry slot.
+        for _ in 0..5 {
+            unsafe {
+                let input_ptr = slot_ref.input.get().cast::<String>();
+                let output_ptr = slot_ref.output.get().cast::<Output>();
+
+                // Initialize input and output.
+                input_ptr.write("hello".to_owned());
+                output_ptr.write(Output { input: &mut *input_ptr });
+
+                // Use and discard output; its `Drop` appends to the aliased input.
+                assert_eq!((*output_ptr).input, "hello");
+                output_ptr.drop_in_place();
+                assert_eq!(&*input_ptr, "hello world");
+
+                // Discard input.
+                input_ptr.drop_in_place();
+            }
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/bench/mod.rs b/crates/divan_compat/divan_fork/src/bench/mod.rs
new file mode 100644
index 00000000..a8e730b8
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/bench/mod.rs
@@ -0,0 +1,1299 @@
+use std::{
+    cell::UnsafeCell,
+    fmt,
+    mem::{self, MaybeUninit},
+    num::NonZeroUsize,
+    sync::Barrier,
+};
+
+use crate::{
+    alloc::{
+        AllocOp, AllocOpMap, AllocTally, ThreadAllocInfo, ThreadAllocTally, TotalAllocTallyMap,
+    },
+    black_box, black_box_drop,
+    counter::{
+        AnyCounter, AsCountUInt, BytesCount, CharsCount, Counter, CounterCollection, CyclesCount,
+        IntoCounter, ItemsCount, KnownCounterKind, MaxCountUInt,
+    },
+    divan::SharedContext,
+    stats::{RawSample, SampleCollection, Stats, StatsSet, TimeSample},
+    thread_pool::BENCH_POOL,
+    time::{FineDuration, Timestamp, UntaggedTimestamp},
+    util::{self, sync::SyncWrap, Unit},
+};
+
+#[cfg(test)]
+mod tests;
+
+mod args;
+mod defer;
+mod options;
+
+use defer::{DeferSlot, DeferStore};
+
+pub use self::{
+    args::{BenchArgs, BenchArgsRunner},
+    options::BenchOptions,
+};
+
+pub(crate) const DEFAULT_SAMPLE_COUNT: u32 = 100;
+
+/// Enables contextual benchmarking in [`#[divan::bench]`](attr.bench.html).
+///
+/// # Examples
+///
+/// ```
+/// use divan::{Bencher, black_box};
+///
+/// #[divan::bench]
+/// fn copy_from_slice(bencher: Bencher) {
+///     // Input and output buffers get used in the closure.
+///     let src = (0..100).collect::<Vec<i32>>();
+///     let mut dst = vec![0; src.len()];
+///
+///     bencher.bench_local(|| {
+///         black_box(&mut dst).copy_from_slice(black_box(&src));
+///     });
+/// }
+/// ```
+#[must_use = "a benchmark function must be registered"]
+pub struct Bencher<'a, 'b, C = BencherConfig> {
+    pub(crate) context: &'a mut BenchContext<'b>,
+    pub(crate) config: C,
+}
+
+/// Public-in-private type for statically-typed `Bencher` configuration.
+///
+/// This enables configuring `Bencher` using the builder pattern with zero
+/// runtime cost.
+pub struct BencherConfig<GenI = Unit> {
+    gen_input: GenI,
+}
+
+impl<C> fmt::Debug for Bencher<'_, '_, C> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Bencher").finish_non_exhaustive()
+    }
+}
+
+impl<'a, 'b> Bencher<'a, 'b> {
+    #[inline]
+    pub(crate) fn new(context: &'a mut BenchContext<'b>) -> Self {
+        Self { context, config: BencherConfig { gen_input: Unit } }
+    }
+}
+
+impl<'a, 'b> Bencher<'a, 'b> {
+    /// Benchmarks a function.
+    ///
+    /// The function can be benchmarked in parallel using the [`threads`
+    /// option](macro@crate::bench#threads). If the function is strictly
+    /// single-threaded, use [`Bencher::bench_local`] instead.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #[divan::bench]
+    /// fn bench(bencher: divan::Bencher) {
+    ///     bencher.bench(|| {
+    ///         // Benchmarked code...
+    ///     });
+    /// }
+    /// ```
+    pub fn bench<O, B>(self, benched: B)
+    where
+        B: Fn() -> O + Sync,
+    {
+        // Reusing `bench_values` for a zero-sized non-drop input type should
+        // have no overhead.
+        self.with_inputs(|| ()).bench_values(|_: ()| benched());
+    }
+
+    /// Benchmarks a function on the current thread.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #[divan::bench]
+    /// fn bench(bencher: divan::Bencher) {
+    ///     bencher.bench_local(|| {
+    ///         // Benchmarked code...
+    ///     });
+    /// }
+    /// ```
+    pub fn bench_local<O, B>(self, mut benched: B)
+    where
+        B: FnMut() -> O,
+    {
+        // Reusing `bench_local_values` for a zero-sized non-drop input type
+        // should have no overhead.
+        self.with_inputs(|| ()).bench_local_values(|_: ()| benched());
+    }
+
+    /// Generate inputs for the [benchmarked function](#input-bench).
+    ///
+    /// Time spent generating inputs does not affect benchmark timing.
+    ///
+    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
+    /// generator is called on the same thread as the sample loop that uses that
+    /// input.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #[divan::bench]
+    /// fn bench(bencher: divan::Bencher) {
+    ///     bencher
+    ///         .with_inputs(|| {
+    ///             // Generate input:
+    ///             String::from("...")
+    ///         })
+    ///         .bench_values(|s| {
+    ///             // Use input by-value:
+    ///             s + "123"
+    ///         });
+    /// }
+    /// ```
+    pub fn with_inputs<G>(self, gen_input: G) -> Bencher<'a, 'b, BencherConfig<G>> {
+        Bencher { context: self.context, config: BencherConfig { gen_input } }
+    }
+}
+
+impl<'a, 'b, GenI> Bencher<'a, 'b, BencherConfig<GenI>> {
+    /// Assign a [`Counter`] for all iterations of the benchmarked function.
+    ///
+    /// This will either:
+    /// - Assign a new counter
+    /// - Override an existing counter of the same type
+    ///
+    /// If the counter depends on [generated inputs](Self::with_inputs), use
+    /// [`Bencher::input_counter`] instead.
+    ///
+    /// If context is not needed, the counter can instead be set via
+    /// [`#[divan::bench(counters = ...)]`](macro@crate::bench#counters).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use divan::{Bencher, counter::BytesCount};
+    ///
+    /// #[divan::bench]
+    /// fn char_count(bencher: Bencher) {
+    ///     let s: String = // ...
+    ///     # String::new();
+    ///
+    ///     bencher
+    ///         .counter(BytesCount::of_str(&s))
+    ///         .bench(|| {
+    ///             divan::black_box(&s).chars().count()
+    ///         });
+    /// }
+    /// ```
+    #[doc(alias = "throughput")]
+    pub fn counter<C>(self, counter: C) -> Self
+    where
+        C: IntoCounter,
+    {
+        let counter = AnyCounter::new(counter);
+        self.context.counters.set_counter(counter);
+        self
+    }
+}
+
+/// <span id="input-bench"></span> Benchmark over [generated inputs](Self::with_inputs).
+impl<'a, 'b, I, GenI> Bencher<'a, 'b, BencherConfig<GenI>>
+where
+    GenI: FnMut() -> I,
+{
+    /// Calls a closure to create a [`Counter`] for each input of the
+    /// benchmarked function.
+    ///
+    /// This will either:
+    /// - Assign a new counter
+    /// - Override an existing counter of the same type
+    ///
+    /// If the counter is constant, use [`Bencher::counter`] instead.
+    ///
+    /// When [benchmarking in parallel](macro@crate::bench#threads), the input
+    /// counter is called on the same thread as the sample loop that generates
+    /// and uses that input.
+    ///
+    /// # Examples
+    ///
+    /// The following example emits info for the number of bytes processed when
+    /// benchmarking [`char`-counting](std::str::Chars::count). The byte count
+    /// is obtained by calling [`BytesCount::of_str`] on each iteration's input
+    /// [`String`].
+    ///
+    /// ```
+    /// use divan::{Bencher, counter::BytesCount};
+    ///
+    /// #[divan::bench]
+    /// fn char_count(bencher: Bencher) {
+    ///     bencher
+    ///         .with_inputs(|| -> String {
+    ///             // ...
+    ///             # String::new()
+    ///         })
+    ///         .input_counter(BytesCount::of_str)
+    ///         .bench_refs(|s| {
+    ///             s.chars().count()
+    ///         });
+    /// }
+    /// ```
+    pub fn input_counter<C, F>(self, make_counter: F) -> Self
+    where
+        F: Fn(&I) -> C + Sync + 'static,
+        C: IntoCounter,
+    {
+        self.context.counters.set_input_counter(make_counter);
+        self
+    }
+
+    /// Creates a [`Counter`] from each input of the benchmarked function.
+    ///
+    /// This may be used if the input generator returns [`u8`]–[`u64`],
+    /// [`usize`], or any nesting of references to those types.
+    ///
+    /// # Examples
+    ///
+    /// The following example emits info for the number of items processed when
+    /// benchmarking [`FromIterator`] from
+    /// <code>[Range](std::ops::Range)<[usize]></code> to [`Vec`].
+    ///
+    /// ```
+    /// use divan::{Bencher, counter::ItemsCount};
+    ///
+    /// #[divan::bench]
+    /// fn range_to_vec(bencher: Bencher) {
+    ///     bencher
+    ///         .with_inputs(|| -> usize {
+    ///             // ...
+    ///             # 0
+    ///         })
+    ///         .count_inputs_as::<ItemsCount>()
+    ///         .bench_values(|n| -> Vec<usize> {
+    ///             (0..n).collect()
+    ///         });
+    /// }
+    /// ```
+    #[inline]
+    pub fn count_inputs_as<C>(self) -> Self
+    where
+        C: Counter,
+        I: AsCountUInt,
+    {
+        match KnownCounterKind::of::<C>() {
+            KnownCounterKind::Bytes => self.input_counter(|c| BytesCount::from(c)),
+            KnownCounterKind::Chars => self.input_counter(|c| CharsCount::from(c)),
+            KnownCounterKind::Cycles => self.input_counter(|c| CyclesCount::from(c)),
+            KnownCounterKind::Items => self.input_counter(|c| ItemsCount::from(c)),
+        }
+    }
+
+    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
+    /// provided by-value.
+    ///
+    /// Per-iteration means the benchmarked function is called exactly once for
+    /// each generated input.
+    ///
+    /// The function can be benchmarked in parallel using the [`threads`
+    /// option](macro@crate::bench#threads). If the function is strictly
+    /// single-threaded, use [`Bencher::bench_local_values`] instead.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #[divan::bench]
+    /// fn bench(bencher: divan::Bencher) {
+    ///     bencher
+    ///         .with_inputs(|| {
+    ///             // Generate input:
+    ///             String::from("...")
+    ///         })
+    ///         .bench_values(|s| {
+    ///             // Use input by-value:
+    ///             s + "123"
+    ///         });
+    /// }
+    /// ```
+    pub fn bench_values<O, B>(self, benched: B)
+    where
+        B: Fn(I) -> O + Sync,
+        GenI: Fn() -> I + Sync,
+    {
+        self.context.bench_loop_threaded(
+            self.config.gen_input,
+            |input| {
+                // SAFETY: Input is guaranteed to be initialized and not
+                // currently referenced by anything else.
+                let input = unsafe { input.get().read().assume_init() };
+
+                benched(input)
+            },
+            // Input ownership is transferred to `benched`.
+            |_input| {},
+        );
+    }
+
+    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
+    /// provided by-value, on the current thread.
+    ///
+    /// Per-iteration means the benchmarked function is called exactly once for
+    /// each generated input.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #[divan::bench]
+    /// fn bench(bencher: divan::Bencher) {
+    ///     let mut values = Vec::new();
+    ///     bencher
+    ///         .with_inputs(|| {
+    ///             // Generate input:
+    ///             String::from("...")
+    ///         })
+    ///         .bench_local_values(|s| {
+    ///             // Use input by-value:
+    ///             values.push(s);
+    ///         });
+    /// }
+    /// ```
+    pub fn bench_local_values<O, B>(self, mut benched: B)
+    where
+        B: FnMut(I) -> O,
+    {
+        self.context.bench_loop_local(
+            self.config.gen_input,
+            |input| {
+                // SAFETY: Input is guaranteed to be initialized and not
+                // currently referenced by anything else.
+                let input = unsafe { input.get().read().assume_init() };
+
+                benched(input)
+            },
+            // Input ownership is transferred to `benched`.
+            |_input| {},
+        );
+    }
+
+    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
+    /// provided by-reference.
+    ///
+    /// Per-iteration means the benchmarked function is called exactly once for
+    /// each generated input.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #[divan::bench]
+    /// fn bench(bencher: divan::Bencher) {
+    ///     bencher
+    ///         .with_inputs(|| {
+    ///             // Generate input:
+    ///             String::from("...")
+    ///         })
+    ///         .bench_refs(|s| {
+    ///             // Use input by-reference:
+    ///             *s += "123";
+    ///         });
+    /// }
+    /// ```
+    pub fn bench_refs<O, B>(self, benched: B)
+    where
+        B: Fn(&mut I) -> O + Sync,
+        GenI: Fn() -> I + Sync,
+    {
+        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
+        self.context.bench_loop_threaded(
+            self.config.gen_input,
+            |input| {
+                // SAFETY: Input is guaranteed to be initialized and not
+                // currently referenced by anything else.
+                let input = unsafe { (*input.get()).assume_init_mut() };
+
+                benched(input)
+            },
+            // Input ownership was not transferred to `benched`.
+            |input| {
+                // SAFETY: This function is called after `benched` outputs are
+                // dropped, so we have exclusive access.
+                unsafe { (*input.get()).assume_init_drop() }
+            },
+        );
+    }
+
+    /// Benchmarks a function over per-iteration [generated inputs](Self::with_inputs),
+    /// provided by-reference, on the current thread.
+    ///
+    /// Per-iteration means the benchmarked function is called exactly once for
+    /// each generated input.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #[divan::bench]
+    /// fn bench(bencher: divan::Bencher) {
+    ///     bencher
+    ///         .with_inputs(|| {
+    ///             // Generate input:
+    ///             String::from("...")
+    ///         })
+    ///         .bench_local_refs(|s| {
+    ///             // Use input by-reference:
+    ///             *s += "123";
+    ///         });
+    /// }
+    /// ```
+    pub fn bench_local_refs<O, B>(self, mut benched: B)
+    where
+        B: FnMut(&mut I) -> O,
+    {
+        // TODO: Allow `O` to reference `&mut I` as long as `I` outlives `O`.
+        self.context.bench_loop_local(
+            self.config.gen_input,
+            |input| {
+                // SAFETY: Input is guaranteed to be initialized and not
+                // currently referenced by anything else.
+                let input = unsafe { (*input.get()).assume_init_mut() };
+
+                benched(input)
+            },
+            // Input ownership was not transferred to `benched`.
+            |input| {
+                // SAFETY: This function is called after `benched` outputs are
+                // dropped, so we have exclusive access.
+                unsafe { (*input.get()).assume_init_drop() }
+            },
+        );
+    }
+}
+
+/// State machine for how the benchmark is being run.
+#[derive(Clone, Copy)]
+pub(crate) enum BenchMode {
+    /// The benchmark is being run as `--test`.
+    ///
+    /// Don't collect samples and run exactly once.
+    Test,
+
+    /// Scale `sample_size` to determine the right size for collecting.
+    Tune { sample_size: u32 },
+
+    /// Simply collect samples.
+    Collect { sample_size: u32 },
+}
+
+impl BenchMode {
+    #[inline]
+    pub fn is_test(self) -> bool {
+        matches!(self, Self::Test)
+    }
+
+    #[inline]
+    pub fn is_tune(self) -> bool {
+        matches!(self, Self::Tune { .. })
+    }
+
+    #[inline]
+    pub fn is_collect(self) -> bool {
+        matches!(self, Self::Collect { .. })
+    }
+
+    #[inline]
+    pub fn sample_size(self) -> u32 {
+        match self {
+            Self::Test => 1,
+            Self::Tune { sample_size, .. } | Self::Collect { sample_size, .. } => sample_size,
+        }
+    }
+}
+
+/// `#[divan::bench]` loop context.
+///
+/// Functions called within the benchmark loop should be `#[inline(always)]` to
+/// ensure instruction cache locality.
+pub(crate) struct BenchContext<'a> {
+    shared_context: &'a SharedContext,
+
+    /// User-configured options.
+    pub options: &'a BenchOptions<'a>,
+
+    /// Whether the benchmark loop was started.
+    pub did_run: bool,
+
+    /// The number of threads to run the benchmark. The default is 1.
+    ///
+    /// When set to 1, the benchmark loop is guaranteed to stay on the current
+    /// thread and not spawn any threads.
+    pub thread_count: NonZeroUsize,
+
+    /// Recorded samples.
+    pub samples: SampleCollection,
+
+    /// Per-iteration counters grouped by sample.
+    counters: CounterCollection,
+}
+
+impl<'a> BenchContext<'a> {
+    /// Creates a new benchmarking context.
+    pub fn new(
+        shared_context: &'a SharedContext,
+        options: &'a BenchOptions,
+        thread_count: NonZeroUsize,
+    ) -> Self {
+        Self {
+            shared_context,
+            options,
+            thread_count,
+            did_run: false,
+            samples: SampleCollection::default(),
+            counters: options.counters.to_collection(),
+        }
+    }
+
+    /// Runs the single-threaded loop for benchmarking `benched`.
+    ///
+    /// # Safety
+    ///
+    /// See `bench_loop_threaded`.
+    pub fn bench_loop_local<I, O>(
+        &mut self,
+        gen_input: impl FnMut() -> I,
+        benched: impl FnMut(&UnsafeCell<MaybeUninit<I>>) -> O,
+        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
+    ) {
+        // SAFETY: Closures are guaranteed to run on the current thread, so they
+        // can safely be mutable and non-`Sync`.
+        unsafe {
+            let gen_input = SyncWrap::new(UnsafeCell::new(gen_input));
+            let benched = SyncWrap::new(UnsafeCell::new(benched));
+            let drop_input = SyncWrap::new(drop_input);
+
+            self.thread_count = NonZeroUsize::MIN;
+            self.bench_loop_threaded::<I, O>(
+                || (*gen_input.get())(),
+                |input| (*benched.get())(input),
+                |input| drop_input(input),
+            )
+        }
+    }
+
+    /// Runs the multi-threaded loop for benchmarking `benched`.
+    ///
+    /// # Safety
+    ///
+    /// If `self.thread_count` is 1, the incoming closures will not escape the
+    /// current thread. This guarantee ensures `bench_loop_local` can soundly
+    /// reuse this method with mutable non-`Sync` closures.
+    ///
+    /// When `benched` is called:
+    /// - `I` is guaranteed to be initialized.
+    /// - No external `&I` or `&mut I` exists.
+    ///
+    /// When `drop_input` is called:
+    /// - All instances of `O` returned from `benched` have been dropped.
+    /// - The same guarantees for `I` apply as in `benched`, unless `benched`
+    ///   escaped references to `I`.
+    fn bench_loop_threaded<I, O>(
+        &mut self,
+        gen_input: impl Fn() -> I + Sync,
+        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O + Sync,
+        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>) + Sync,
+    ) {
+        self.did_run = true;
+
+        let mut current_mode = self.initial_mode();
+        let is_test = current_mode.is_test();
+
+        let record_sample = self.sample_recorder(gen_input, benched, drop_input);
+
+        let thread_count = self.thread_count.get();
+        let aux_thread_count = thread_count - 1;
+
+        let is_single_thread = aux_thread_count == 0;
+
+        // Per-thread sample info returned by `record_sample`. These are
+        // processed locally to emit user-facing sample info. As a result, this
+        // only contains `thread_count` many elements at a time.
+        let mut raw_samples = Vec::<Option<RawSample>>::new();
+
+        // The time spent benchmarking, in picoseconds.
+        //
+        // Unless `skip_ext_time` is set, this includes time external to
+        // `benched`, such as time spent generating inputs and running drop.
+        let mut elapsed_picos: u128 = 0;
+
+        // The minimum time for benchmarking, in picoseconds.
+        let min_picos = self.options.min_time().picos;
+
+        // The maximum time allowed for benchmarking, in picoseconds.
+        let max_picos = self.options.max_time().picos;
+
+        // Don't bother running if user specifies 0 max time or 0 samples.
+        if max_picos == 0 || !self.options.has_samples() {
+            return;
+        }
+
+        let timer = self.shared_context.timer;
+        let timer_kind = timer.kind();
+
+        let mut rem_samples = if current_mode.is_collect() {
+            Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT))
+        } else {
+            None
+        };
+
+        // Only measure precision if we need to tune sample size.
+        let timer_precision =
+            if current_mode.is_tune() { timer.precision() } else { FineDuration::default() };
+
+        if !is_test {
+            self.samples.time_samples.reserve(self.options.sample_count.unwrap_or(1) as usize);
+        }
+
+        let skip_ext_time = self.options.skip_ext_time.unwrap_or_default();
+        let initial_start = if skip_ext_time { None } else { Some(Timestamp::start(timer_kind)) };
+
+        let bench_overheads = timer.bench_overheads();
+
+        while {
+            // Conditions for when sampling is over:
+            if elapsed_picos >= max_picos {
+                // Depleted the benchmarking time budget. This is a strict
+                // condition regardless of sample count and minimum time.
+                false
+            } else if rem_samples.unwrap_or(1) > 0 {
+                // More samples expected.
+                true
+            } else {
+                // Continue if we haven't reached the time floor.
+                elapsed_picos < min_picos
+            }
+        } {
+            let sample_size = current_mode.sample_size();
+            self.samples.sample_size = sample_size;
+
+            let barrier = if is_single_thread { None } else { Some(Barrier::new(thread_count)) };
+
+            // Sample loop helper:
+            let record_sample = || -> RawSample {
+                let mut counter_totals: [u128; KnownCounterKind::COUNT] =
+                    [0; KnownCounterKind::COUNT];
+
+                // Updates per-input counter info for this sample.
+                let mut count_input = |input: &I| {
+                    for counter_kind in KnownCounterKind::ALL {
+                        // SAFETY: The `I` type cannot change since `with_inputs`
+                        // cannot be called more than once on the same `Bencher`.
+                        if let Some(count) =
+                            unsafe { self.counters.get_input_count(counter_kind, input) }
+                        {
+                            let total = &mut counter_totals[counter_kind as usize];
+                            *total = (*total).saturating_add(count as u128);
+                        }
+                    }
+                };
+
+                // Sample loop:
+                let ([start, end], alloc_info) =
+                    record_sample(sample_size as usize, barrier.as_ref(), &mut count_input);
+
+                RawSample { start, end, timer, alloc_info, counter_totals }
+            };
+
+            // Sample loop:
+            raw_samples.clear();
+            BENCH_POOL.par_extend(&mut raw_samples, aux_thread_count, |_| record_sample());
+
+            // Convert `&[Option<RawSample>]` to `&[RawSample]`.
+            let raw_samples: &[RawSample] = {
+                if let Some(thread) = raw_samples
+                    .iter()
+                    .enumerate()
+                    .find_map(|(thread, sample)| sample.is_none().then_some(thread))
+                {
+                    panic!("Divan benchmarking thread {thread} panicked");
+                }
+
+                unsafe {
+                    assert_eq!(mem::size_of::<RawSample>(), mem::size_of::<Option<RawSample>>());
+                    std::slice::from_raw_parts(raw_samples.as_ptr().cast(), raw_samples.len())
+                }
+            };
+
+            // If testing, exit the benchmarking loop immediately after timing a
+            // single run.
+            if is_test {
+                break;
+            }
+
+            let slowest_sample = raw_samples.iter().max_by_key(|s| s.duration()).unwrap();
+            let slowest_time = slowest_sample.duration();
+
+            // TODO: Make tuning be less influenced by early runs. Currently if
+            // early runs are very quick but later runs are slow, benchmarking
+            // will take a very long time.
+            //
+            // TODO: Make `sample_size` consider time generating inputs and
+            // dropping inputs/outputs. Currently benchmarks like
+            // `Bencher::bench_refs(String::clear)` take a very long time.
+            if current_mode.is_tune() {
+                // Clear previous smaller samples.
+                self.samples.clear();
+                self.counters.clear_input_counts();
+
+                // If within 100x timer precision, continue tuning.
+                let precision_multiple = slowest_time.picos / timer_precision.picos;
+                if precision_multiple <= 100 {
+                    current_mode = BenchMode::Tune { sample_size: sample_size * 2 };
+                } else {
+                    current_mode = BenchMode::Collect { sample_size };
+                    rem_samples = Some(self.options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT));
+                }
+            }
+
+            // Returns the sample's duration adjusted for overhead.
+            let sample_duration_sub_overhead = |raw_sample: &RawSample| {
+                let overhead = bench_overheads.total_overhead(sample_size, &raw_sample.alloc_info);
+
+                FineDuration {
+                    picos: raw_sample
+                        .duration()
+                        .clamp_to(timer_precision)
+                        .picos
+                        .saturating_sub(overhead.picos),
+                }
+                .clamp_to(timer_precision)
+            };
+
+            for raw_sample in raw_samples {
+                let sample_index = self.samples.time_samples.len();
+
+                self.samples
+                    .time_samples
+                    .push(TimeSample { duration: sample_duration_sub_overhead(raw_sample) });
+
+                if !raw_sample.alloc_info.tallies.is_empty() {
+                    self.samples
+                        .alloc_info_by_sample
+                        .insert(sample_index as u32, raw_sample.alloc_info.clone());
+                }
+
+                // Insert per-input counter information.
+                for counter_kind in KnownCounterKind::ALL {
+                    if !self.counters.uses_input_counts(counter_kind) {
+                        continue;
+                    }
+
+                    let total_count = raw_sample.counter_totals[counter_kind as usize];
+
+                    // Cannot overflow `MaxCountUInt` because `total_count`
+                    // cannot exceed `MaxCountUInt::MAX * sample_size`.
+                    let per_iter_count = (total_count / sample_size as u128) as MaxCountUInt;
+
+                    self.counters.push_counter(AnyCounter::known(counter_kind, per_iter_count));
+                }
+
+                if let Some(rem_samples) = &mut rem_samples {
+                    *rem_samples = rem_samples.saturating_sub(1);
+                }
+            }
+
+            if let Some(initial_start) = initial_start {
+                let last_end = raw_samples.iter().map(|s| s.end).max().unwrap();
+                elapsed_picos = last_end.duration_since(initial_start, timer).picos;
+            } else {
+                // Progress by at least 1ns to prevent extremely fast
+                // functions from taking forever when `min_time` is set.
+                let progress_picos = slowest_time.picos.max(1_000);
+                elapsed_picos = elapsed_picos.saturating_add(progress_picos);
+            }
+        }
+
+        // Reset flag for ignoring allocations.
+        crate::alloc::IGNORE_ALLOC.set(false);
+    }
+
+    /// Returns a closure that takes the sample size, thread barrier, and input
+    /// counter, and then returns a newly recorded sample.
+    fn sample_recorder<I, O>(
+        &self,
+        gen_input: impl Fn() -> I,
+        benched: impl Fn(&UnsafeCell<MaybeUninit<I>>) -> O,
+        drop_input: impl Fn(&UnsafeCell<MaybeUninit<I>>),
+    ) -> impl Fn(usize, Option<&Barrier>, &mut dyn FnMut(&I)) -> ([Timestamp; 2], ThreadAllocInfo)
+    {
+        // We defer:
+        // - Usage of `gen_input` values.
+        // - Drop destructor for `O`, preventing it from affecting sample
+        //   measurements. Outputs are stored into a pre-allocated buffer during
+        //   the sample loop. The allocation is reused between samples to reduce
+        //   time spent between samples.
+
+        let timer_kind = self.shared_context.timer.kind();
+
+        move |sample_size: usize, barrier: Option<&Barrier>, count_input: &mut dyn FnMut(&I)| {
+            let mut defer_store = DeferStore::<I, O>::default();
+
+            let mut saved_alloc_info = ThreadAllocInfo::new();
+            let mut save_alloc_info = || {
+                if crate::alloc::IGNORE_ALLOC.get() {
+                    return;
+                }
+
+                if let Some(alloc_info) = ThreadAllocInfo::try_current() {
+                    // SAFETY: We have exclusive access.
+                    saved_alloc_info = unsafe { alloc_info.as_ptr().read() };
+                }
+            };
+
+            // Synchronize all threads to start timed section simultaneously and
+            // clear every thread's memory profiling info.
+            //
+            // This ensures work external to the timed section does not affect
+            // the timing of other threads.
+            let sync_threads = |is_start: bool| {
+                sync_impl(barrier, is_start);
+
+                // Keep the implementation non-generic to reduce code size.
+                #[inline(never)]
+                fn sync_impl(barrier: Option<&Barrier>, is_start: bool) {
+                    // Ensure benchmarked section has a `ThreadAllocInfo`
+                    // allocated for the current thread and clear previous info.
+                    let alloc_info = if is_start { ThreadAllocInfo::current() } else { None };
+
+                    // Synchronize all threads.
+                    //
+                    // This is the final synchronization point for the end.
+                    if let Some(barrier) = barrier {
+                        barrier.wait();
+                    }
+
+                    if let Some(mut alloc_info) = alloc_info {
+                        // SAFETY: We have exclusive access.
+                        let alloc_info = unsafe { alloc_info.as_mut() };
+
+                        alloc_info.clear();
+
+                        // Synchronize all threads.
+                        if let Some(barrier) = barrier {
+                            barrier.wait();
+                        }
+                    }
+                }
+            };
+
+            // The following logic chooses how to efficiently sample the
+            // benchmark function once and assigns `sample_start`/`sample_end`
+            // before/after the sample loop.
+            //
+            // NOTE: Testing and benchmarking should behave exactly the same
+            // when getting the sample time span. We don't want to introduce
+            // extra work that may worsen measurement quality for real
+            // benchmarking.
+            let sample_start: UntaggedTimestamp;
+            let sample_end: UntaggedTimestamp;
+
+            if size_of::<I>() == 0 && (size_of::<O>() == 0 || !mem::needs_drop::<O>()) {
+                // Use a range instead of `defer_store` to make the benchmarking
+                // loop cheaper.
+
+                // Run `gen_input` the expected number of times in case it
+                // updates external state used by `benched`.
+                for _ in 0..sample_size {
+                    let input = gen_input();
+                    count_input(&input);
+
+                    // Inputs are consumed/dropped later.
+                    mem::forget(input);
+                }
+
+                sync_threads(true);
+                sample_start = UntaggedTimestamp::start(timer_kind);
+
+                // Sample loop:
+                for _ in 0..sample_size {
+                    // SAFETY: Input is a ZST, so we can construct one out of
+                    // thin air.
+                    let input = unsafe { UnsafeCell::new(MaybeUninit::<I>::zeroed()) };
+
+                    mem::forget(black_box(benched(&input)));
+                }
+
+                sample_end = UntaggedTimestamp::end(timer_kind);
+                sync_threads(false);
+                save_alloc_info();
+
+                // Drop outputs and inputs.
+                for _ in 0..sample_size {
+                    // Output only needs drop if ZST.
+                    if size_of::<O>() == 0 {
+                        // SAFETY: Output is a ZST, so we can construct one out
+                        // of thin air.
+                        unsafe { _ = mem::zeroed::<O>() }
+                    }
+
+                    if mem::needs_drop::<I>() {
+                        // SAFETY: Input is a ZST, so we can construct one out
+                        // of thin air and not worry about aliasing.
+                        unsafe { drop_input(&UnsafeCell::new(MaybeUninit::<I>::zeroed())) }
+                    }
+                }
+            } else {
+                defer_store.prepare(sample_size);
+
+                match defer_store.slots() {
+                    // Output needs to be dropped. We defer drop in the sample
+                    // loop by inserting it into `defer_store`.
+                    Ok(defer_slots_slice) => {
+                        // Initialize and store inputs.
+                        for DeferSlot { input, .. } in defer_slots_slice {
+                            // SAFETY: We have exclusive access to `input`.
+                            let input = unsafe { &mut *input.get() };
+                            let input = input.write(gen_input());
+                            count_input(input);
+
+                            // Make input opaque to benchmarked function.
+                            black_box(input);
+                        }
+
+                        // Create iterator before the sample timing section to
+                        // reduce benchmarking overhead.
+                        let defer_slots_iter = defer_slots_slice.iter();
+
+                        sync_threads(true);
+                        sample_start = UntaggedTimestamp::start(timer_kind);
+
+                        // Sample loop:
+                        for defer_slot in defer_slots_iter {
+                            // SAFETY: All inputs in `defer_store` were
+                            // initialized and we have exclusive access to the
+                            // output slot.
+                            unsafe {
+                                let output = benched(&defer_slot.input);
+                                *defer_slot.output.get() = MaybeUninit::new(output);
+                            }
+                        }
+
+                        sample_end = UntaggedTimestamp::end(timer_kind);
+                        sync_threads(false);
+                        save_alloc_info();
+
+                        // Prevent the optimizer from removing writes to inputs
+                        // and outputs in the sample loop.
+                        black_box(defer_slots_slice);
+
+                        // Drop outputs and inputs.
+                        for DeferSlot { input, output } in defer_slots_slice {
+                            // SAFETY: All outputs were initialized in the
+                            // sample loop and we have exclusive access.
+                            unsafe { (*output.get()).assume_init_drop() }
+
+                            if mem::needs_drop::<I>() {
+                                // SAFETY: The output was dropped and thus we
+                                // have exclusive access to inputs.
+                                unsafe { drop_input(input) }
+                            }
+                        }
+                    }
+
+                    // Output does not need to be dropped.
+                    Err(defer_inputs_slice) => {
+                        // Initialize and store inputs.
+                        for input in defer_inputs_slice {
+                            // SAFETY: We have exclusive access to `input`.
+                            let input = unsafe { &mut *input.get() };
+                            let input = input.write(gen_input());
+                            count_input(input);
+
+                            // Make input opaque to benchmarked function.
+                            black_box(input);
+                        }
+
+                        // Create iterator before the sample timing section to
+                        // reduce benchmarking overhead.
+                        let defer_inputs_iter = defer_inputs_slice.iter();
+
+                        sync_threads(true);
+                        sample_start = UntaggedTimestamp::start(timer_kind);
+
+                        // Sample loop:
+                        for input in defer_inputs_iter {
+                            // SAFETY: All inputs in `defer_store` were
+                            // initialized.
+                            black_box_drop(unsafe { benched(input) });
+                        }
+
+                        sample_end = UntaggedTimestamp::end(timer_kind);
+                        sync_threads(false);
+                        save_alloc_info();
+
+                        // Prevent the optimizer from removing writes to inputs
+                        // in the sample loop.
+                        black_box(defer_inputs_slice);
+
+                        // Drop inputs.
+                        if mem::needs_drop::<I>() {
+                            for input in defer_inputs_slice {
+                                // SAFETY: We have exclusive access to inputs.
+                                unsafe { drop_input(input) }
+                            }
+                        }
+                    }
+                }
+            }
+
+            // SAFETY: These values are guaranteed to be the correct variant
+            // because they were created from the same `timer_kind`.
+            let interval = unsafe {
+                [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
+            };
+
+            (interval, saved_alloc_info)
+        }
+    }
+
+    #[inline]
+    fn initial_mode(&self) -> BenchMode {
+        // A test run always wins. Otherwise an explicit sample size selects `Collect` with
+        // that size, and a missing one selects `Tune` starting at a sample size of 1.
+        match self.options.sample_size {
+            _ if self.shared_context.action.is_test() => BenchMode::Test,
+            Some(sample_size) => BenchMode::Collect { sample_size },
+            None => BenchMode::Tune { sample_size: 1 },
+        }
+    }
+
+    /// Computes fastest/slowest/median/mean statistics for time, allocations, and counters
+    /// from the collected samples. Durations and allocation tallies are reported per
+    /// iteration; a counter's entry is `None` if there are no samples or a count is missing.
+    pub fn compute_stats(&self) -> Stats {
+        let time_samples = &self.samples.time_samples;
+        let alloc_info_by_sample = &self.samples.alloc_info_by_sample;
+        let sample_count = time_samples.len();
+        let sample_size = self.samples.sample_size;
+        let total_count = self.samples.iter_count();
+        let total_duration = self.samples.total_duration();
+        let mean_duration = FineDuration {
+            picos: total_duration.picos.checked_div(total_count as u128).unwrap_or_default(),
+        };
+
+        // Samples sorted by duration.
+        let sorted_samples = self.samples.sorted_samples();
+        let median_samples = util::slice_middle(&sorted_samples);
+
+        let index_of_sample = |sample: &TimeSample| -> usize {
+            util::slice_ptr_index(&self.samples.time_samples, sample)
+        };
+
+        let counter_count_for_sample =
+            |sample: &TimeSample, counter_kind: KnownCounterKind| -> Option<MaxCountUInt> {
+                let counts = self.counters.counts(counter_kind);
+
+                let index = if self.counters.uses_input_counts(counter_kind) {
+                    index_of_sample(sample)
+                } else {
+                    0
+                };
+
+                counts.get(index).copied()
+            };
+
+        let min_duration =
+            sorted_samples.first().map(|s| s.duration / sample_size).unwrap_or_default();
+        let max_duration =
+            sorted_samples.last().map(|s| s.duration / sample_size).unwrap_or_default();
+
+        let median_duration = if median_samples.is_empty() {
+            FineDuration::default()
+        } else {
+            let sum: u128 = median_samples.iter().map(|s| s.duration.picos).sum();
+            FineDuration { picos: sum / median_samples.len() as u128 } / sample_size
+        };
+
+        let counts = KnownCounterKind::ALL.map(|counter_kind| {
+            let median: MaxCountUInt = {
+                let mut sum: u128 = 0;
+                for sample in median_samples {
+                    let sample_count = counter_count_for_sample(sample, counter_kind)? as u128;
+                    // Saturating add in case `MaxCountUInt > u64`.
+                    sum = sum.saturating_add(sample_count);
+                }
+
+                // With no samples the divisor is zero; yield `None` for this counter instead
+                // of panicking on the division.
+                sum.checked_div(median_samples.len() as u128)? as MaxCountUInt
+            };
+
+            Some(StatsSet {
+                fastest: sorted_samples
+                    .first()
+                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
+                slowest: sorted_samples
+                    .last()
+                    .and_then(|s| counter_count_for_sample(s, counter_kind))?,
+                median,
+                mean: self.counters.mean_count(counter_kind),
+            })
+        });
+
+        let sample_alloc_info = |sample: Option<&TimeSample>| -> Option<&ThreadAllocInfo> {
+            sample
+                .and_then(|sample| u32::try_from(index_of_sample(sample)).ok())
+                .and_then(|index| self.samples.alloc_info_by_sample.get(&index))
+        };
+
+        let sample_alloc_tally = |sample: Option<&TimeSample>, op: AllocOp| -> ThreadAllocTally {
+            sample_alloc_info(sample)
+                .map(|alloc_info| alloc_info.tallies.get(op))
+                .copied()
+                .unwrap_or_default()
+        };
+
+        let mut alloc_total_max_count = 0u128;
+        let mut alloc_total_max_size = 0u128;
+        let mut alloc_total_tallies = TotalAllocTallyMap::default();
+
+        for alloc_info in alloc_info_by_sample.values() {
+            alloc_total_max_count += alloc_info.max_count as u128;
+            alloc_total_max_size += alloc_info.max_size as u128;
+            alloc_info.tallies.add_to_total(&mut alloc_total_tallies);
+        }
+
+        let sample_size = f64::from(sample_size);
+        Stats {
+            sample_count: sample_count as u32,
+            iter_count: total_count,
+            time: StatsSet {
+                fastest: min_duration,
+                slowest: max_duration,
+                median: median_duration,
+                mean: mean_duration,
+            },
+            max_alloc: StatsSet {
+                fastest: {
+                    let alloc_info = sample_alloc_info(sorted_samples.first().copied());
+
+                    AllocTally {
+                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
+                            / sample_size,
+                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
+                            / sample_size,
+                    }
+                },
+                slowest: {
+                    let alloc_info = sample_alloc_info(sorted_samples.last().copied());
+
+                    AllocTally {
+                        count: alloc_info.map(|info| info.max_count as f64).unwrap_or_default()
+                            / sample_size,
+                        size: alloc_info.map(|info| info.max_size as f64).unwrap_or_default()
+                            / sample_size,
+                    }
+                },
+                // TODO: Switch to median of alloc info itself, rather than
+                // basing off of median times.
+                median: {
+                    let alloc_info_for_median =
+                        |index| sample_alloc_info(median_samples.get(index).copied());
+
+                    let max_count_for_median = |index: usize| -> f64 {
+                        alloc_info_for_median(index)
+                            .map(|info| info.max_count as f64)
+                            .unwrap_or_default()
+                    };
+
+                    let max_size_for_median = |index: usize| -> f64 {
+                        alloc_info_for_median(index)
+                            .map(|info| info.max_size as f64)
+                            .unwrap_or_default()
+                    };
+
+                    let median_count = median_samples.len().max(1) as f64;
+
+                    let median_max_count = max_count_for_median(0) + max_count_for_median(1);
+                    let median_max_size = max_size_for_median(0) + max_size_for_median(1);
+
+                    AllocTally {
+                        count: median_max_count / median_count / sample_size,
+                        size: median_max_size / median_count / sample_size,
+                    }
+                },
+                mean: AllocTally {
+                    count: alloc_total_max_count as f64 / total_count as f64,
+                    size: alloc_total_max_size as f64 / total_count as f64,
+                },
+            }
+            .transpose(),
+            alloc_tallies: AllocOpMap {
+                values: AllocOp::ALL
+                    .map(|op| StatsSet {
+                        fastest: {
+                            let fastest = sample_alloc_tally(sorted_samples.first().copied(), op);
+
+                            AllocTally {
+                                count: fastest.count as f64 / sample_size,
+                                size: fastest.size as f64 / sample_size,
+                            }
+                        },
+                        slowest: {
+                            let slowest = sample_alloc_tally(sorted_samples.last().copied(), op);
+
+                            AllocTally {
+                                count: slowest.count as f64 / sample_size,
+                                size: slowest.size as f64 / sample_size,
+                            }
+                        },
+                        median: {
+                            let tally_for_median = |index: usize| -> ThreadAllocTally {
+                                sample_alloc_tally(median_samples.get(index).copied(), op)
+                            };
+
+                            let a = tally_for_median(0);
+                            let b = tally_for_median(1);
+
+                            let median_count = median_samples.len().max(1) as f64;
+
+                            let avg_count = (a.count as f64 + b.count as f64) / median_count;
+                            let avg_size = (a.size as f64 + b.size as f64) / median_count;
+
+                            AllocTally {
+                                count: avg_count / sample_size,
+                                size: avg_size / sample_size,
+                            }
+                        },
+                        mean: {
+                            let tally = alloc_total_tallies.get(op);
+                            AllocTally {
+                                count: tally.count as f64 / total_count as f64,
+                                size: tally.size as f64 / total_count as f64,
+                            }
+                        },
+                    })
+                    .map(StatsSet::transpose),
+            },
+            counts,
+        }
+    }
+}
+
+impl<T> StatsSet<AllocTally<T>> {
+    /// Regroups a set of tallies into a tally of sets, one set per tally field.
+    #[inline]
+    pub fn transpose(self) -> AllocTally<StatsSet<T>> {
+        let count = StatsSet {
+            fastest: self.fastest.count,
+            slowest: self.slowest.count,
+            median: self.median.count,
+            mean: self.mean.count,
+        };
+        let size = StatsSet {
+            fastest: self.fastest.size,
+            slowest: self.slowest.size,
+            median: self.median.size,
+            mean: self.mean.size,
+        };
+        AllocTally { count, size }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/bench/options.rs b/crates/divan_compat/divan_fork/src/bench/options.rs
new file mode 100644
index 00000000..e4f7f96b
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/bench/options.rs
@@ -0,0 +1,85 @@
+use std::{borrow::Cow, time::Duration};
+
+use crate::{counter::CounterSet, time::FineDuration};
+
+/// Benchmarking options set directly by the user in `#[divan::bench]` and
+/// `#[divan::bench_group]`.
+///
+/// Changes to fields must be reflected in the "Options" sections of the docs
+/// for `#[divan::bench]` and `#[divan::bench_group]`.
+#[derive(Clone, Default)]
+pub struct BenchOptions<'a> {
+    /// The number of sample recordings.
+    pub sample_count: Option<u32>,
+
+    /// The number of iterations inside a single sample.
+    pub sample_size: Option<u32>,
+
+    /// The number of threads to benchmark the sample. This is 1 by default.
+    ///
+    /// If set to 0, this will use [`std::thread::available_parallelism`].
+    ///
+    /// Stored as a [`Cow`] so that options merged by `overwrite` can borrow the thread
+    /// counts from their sources instead of cloning them.
+    pub threads: Option<Cow<'a, [usize]>>,
+
+    /// Counts the number of values processed each iteration of a benchmarked
+    /// function.
+    pub counters: CounterSet,
+
+    /// The time floor for benchmarking a function.
+    pub min_time: Option<Duration>,
+
+    /// The time ceiling for benchmarking a function.
+    pub max_time: Option<Duration>,
+
+    /// When accounting for `min_time` or `max_time`, skip time external to
+    /// benchmarked functions, such as time spent generating inputs and running
+    /// [`Drop`].
+    pub skip_ext_time: Option<bool>,
+
+    /// Whether the benchmark should be ignored.
+    ///
+    /// This may be set within the attribute or with a separate
+    /// [`#[ignore]`](https://doc.rust-lang.org/reference/attributes/testing.html#the-ignore-attribute).
+    pub ignore: Option<bool>,
+}
+
+impl<'a> BenchOptions<'a> {
+    /// Merges two option sets: values set in `self` take precedence over those in `other`.
+    #[must_use]
+    pub(crate) fn overwrite<'b>(&'b self, other: &'b Self) -> Self
+    where
+        'b: 'a,
+    {
+        Self {
+            sample_count: self.sample_count.or(other.sample_count),
+            sample_size: self.sample_size.or(other.sample_size),
+            // Thread counts are borrowed from whichever side has them, never cloned.
+            threads: self.threads.as_ref().or(other.threads.as_ref()).map(|t| Cow::Borrowed(&**t)),
+            min_time: self.min_time.or(other.min_time),
+            max_time: self.max_time.or(other.max_time),
+            skip_ext_time: self.skip_ext_time.or(other.skip_ext_time),
+            ignore: self.ignore.or(other.ignore),
+            counters: self.counters.overwrite(&other.counters),
+        }
+    }
+
+    /// Returns `false` only if the sample count or the sample size is explicitly zero.
+    #[inline]
+    pub(crate) fn has_samples(&self) -> bool {
+        ![self.sample_count, self.sample_size].contains(&Some(0))
+    }
+
+    /// The minimum benchmarking time, or the default `FineDuration` if unset.
+    #[inline]
+    pub(crate) fn min_time(&self) -> FineDuration {
+        self.min_time.map_or_else(FineDuration::default, FineDuration::from)
+    }
+
+    /// The maximum benchmarking time, or `FineDuration::MAX` if unset.
+    #[inline]
+    pub(crate) fn max_time(&self) -> FineDuration {
+        self.max_time.map_or(FineDuration::MAX, FineDuration::from)
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/bench/tests.rs b/crates/divan_compat/divan_fork/src/bench/tests.rs
new file mode 100644
index 00000000..22f006f1
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/bench/tests.rs
@@ -0,0 +1,544 @@
+//! Tests every benchmarking loop combination in `Bencher`. When run under Miri,
+//! this catches memory leaks and UB in `unsafe` code.
+
+use std::{
+    collections::HashSet,
+    sync::atomic::{AtomicUsize, Ordering::SeqCst},
+};
+
+use util::defer;
+
+use super::*;
+use crate::{
+    config::Action,
+    time::{Timer, TimerKind},
+};
+
+// We use a small number of runs because Miri is very slow.
+const SAMPLE_COUNT: u32 = 3;
+
+const SAMPLE_SIZE: u32 = 2;
+
+// Thread counts that `test_bencher` runs every benchmark with. Outside of Miri, the list has
+// `SAMPLE_COUNT` in the middle and higher counts where `SAMPLE_COUNT % n != 0`.
+const THREAD_COUNTS: &[usize] = if cfg!(miri) {
+    // Speed up Miri tests while still catching UB/memory issues.
+    &[1, 2]
+} else {
+    // Exhaustively test expectations.
+    //
+    // Tests `SAMPLE_COUNT` by:
+    // - Including it in the middle
+    // - Having numbers where `SAMPLE_COUNT % n` varies
+    &[1, 2, 3, 4, 5, 6, 9]
+};
+
+/// Runs `test` against a fresh `Bencher` for every timer, action, and thread count.
+#[track_caller]
+fn test_bencher(test: &mut dyn FnMut(Bencher)) {
+    // Silence Miri about leaking threads.
+    let _drop_threads = defer(|| BENCH_POOL.drop_threads());
+
+    // Fixed sample count and size, shared by every configuration below.
+    let bench_options = BenchOptions {
+        sample_count: Some(SAMPLE_COUNT),
+        sample_size: Some(SAMPLE_SIZE),
+        ..BenchOptions::default()
+    };
+
+    for timer in Timer::available() {
+        for action in [Action::Bench, Action::Test] {
+            let shared_context = SharedContext { action, timer };
+
+            for thread_count in THREAD_COUNTS.iter().copied() {
+                // `THREAD_COUNTS` holds no zero, so `unwrap` cannot fail here.
+                let thread_count = NonZeroUsize::new(thread_count).unwrap();
+                let mut bench_context =
+                    BenchContext::new(&shared_context, &bench_options, thread_count);
+
+                test(Bencher::new(&mut bench_context));
+
+                assert!(bench_context.did_run);
+
+                // '--test' should run the expected number of times but not
+                // allocate any samples.
+                if !action.is_test() {
+                    continue;
+                }
+                assert_eq!(bench_context.samples.time_samples.capacity(), 0);
+            }
+        }
+    }
+}
+
+fn make_string() -> String {
+    (b'a'..=b'z').map(char::from).collect()
+}
+
+/// Tests that the benchmarked function runs the expected number of times when
+/// running either in benchmark or test mode.
+///
+/// Tests operate over all input/output combinations of:
+/// - `()`
+/// - `i32`
+/// - `String`
+/// - Zero sized type (ZST) that implements `Drop`
+///
+/// This ensures that any special handling of `size_of` or `needs_drop` does not
+/// affect the number of runs.
+#[allow(clippy::unused_unit)]
+mod run_count {
+    use super::*;
+
+    fn test(run_bench: fn(Bencher, &(dyn Fn() + Sync))) {
+        test_with_drop_counter(&AtomicUsize::new(usize::MAX), run_bench); // MAX: skip drop check
+    }
+
+    /// Runs `run_bench` under every `test_bencher` configuration and checks its run counts.
+    /// Drops are checked too unless `drop_count` starts at the `usize::MAX` sentinel.
+    fn test_with_drop_counter(
+        drop_count: &AtomicUsize,
+        run_bench: fn(Bencher, &(dyn Fn() + Sync)),
+    ) {
+        let checks_drops = drop_count.load(SeqCst) != usize::MAX;
+
+        // Invocations of the benchmarked closure, tallied separately per action.
+        let bench_runs = AtomicUsize::new(0);
+        let test_runs = AtomicUsize::new(0);
+
+        // Thread counts and timer kinds that `test_bencher` exercised.
+        let mut seen_thread_counts = HashSet::<u32>::new();
+        let (mut saw_os_timer, mut saw_tsc_timer) = (false, false);
+
+        test_bencher(&mut |bencher| {
+            let context = &bencher.context;
+            let is_test = context.shared_context.action.is_test();
+            let threads = context.thread_count.get();
+
+            seen_thread_counts.insert(threads as u32);
+            match context.shared_context.timer.kind() {
+                TimerKind::Os => saw_os_timer = true,
+                TimerKind::Tsc => saw_tsc_timer = true,
+            }
+
+            // Count how often this single configuration invokes the closure.
+            let runs = if is_test { &test_runs } else { &bench_runs };
+            let runs_before = runs.load(SeqCst);
+            run_bench(bencher, &|| {
+                runs.fetch_add(1, SeqCst);
+            });
+            let observed = runs.load(SeqCst) - runs_before;
+
+            let expected = if is_test {
+                // A test run is expected to invoke the closure once per thread.
+                threads
+            } else {
+                // The sample count is expected to be rounded up to a multiple of the
+                // thread count.
+                let rem = SAMPLE_COUNT as usize % threads;
+                let padding = if rem == 0 { 0 } else { (threads - rem) as u32 };
+                ((SAMPLE_COUNT + padding) * SAMPLE_SIZE) as usize
+            };
+
+            assert_eq!(observed, expected);
+        });
+
+        let thread_count_sum: u32 = seen_thread_counts.into_iter().sum();
+        let timer_count = u32::from(saw_os_timer) + u32::from(saw_tsc_timer);
+        let bench_runs = bench_runs.into_inner() as u32;
+        let test_runs = test_runs.into_inner() as u32;
+
+        let total_runs = bench_runs + test_runs;
+        assert_ne!(total_runs, 0);
+
+        // The tracked value should have been dropped exactly once per run.
+        if checks_drops {
+            assert_eq!(drop_count.load(SeqCst), total_runs as usize);
+        }
+
+        // Test runs: once per thread, for every timer and every distinct thread count.
+        assert_eq!(test_runs, timer_count * thread_count_sum);
+    }
+
+    #[test]
+    fn bench() {
+        struct DroppedZst; // Zero-sized output whose drops are tallied in `ZST_DROP_COUNT`.
+
+        static ZST_DROP_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+        impl Drop for DroppedZst {
+            fn drop(&mut self) {
+                ZST_DROP_COUNT.fetch_add(1, SeqCst);
+            }
+        }
+
+        // `()` out.
+        test(|b, f| b.bench(f));
+
+        // `i32` out.
+        test(|b, f| {
+            b.bench(|| -> i32 {
+                f();
+                100i32
+            })
+        });
+
+        // `String` out.
+        test(|b, f| {
+            b.bench(|| -> String {
+                f();
+                make_string()
+            })
+        });
+
+        // `DroppedZst` out, with the number of drops checked against the number of runs.
+        test_with_drop_counter(&ZST_DROP_COUNT, |b, f| {
+            b.bench(|| -> DroppedZst {
+                f();
+                DroppedZst
+            })
+        });
+    }
+
+    #[test]
+    fn bench_values() {
+        struct DroppedZst; // Zero-sized type whose drops are tallied in `ZST_DROP_COUNT`.
+
+        static ZST_DROP_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+        impl Drop for DroppedZst {
+            fn drop(&mut self) {
+                ZST_DROP_COUNT.fetch_add(1, SeqCst);
+            }
+        }
+
+        let test_zst_drop = |run_bench| {
+            ZST_DROP_COUNT.store(0, SeqCst); // Every case starts counting drops from zero.
+            test_with_drop_counter(&ZST_DROP_COUNT, run_bench);
+        };
+
+        // `()` in, `()` out.
+        test(|b, f| b.with_inputs(|| ()).bench_values(|_: ()| -> () { f() }));
+
+        // `()` in, `i32` out.
+        test(|b, f| {
+            b.with_inputs(|| ()).bench_values(|_: ()| -> i32 {
+                f();
+                100i32
+            })
+        });
+
+        // `()` in, `String` out.
+        test(|b, f| {
+            b.with_inputs(|| ()).bench_values(|_: ()| -> String {
+                f();
+                make_string()
+            })
+        });
+
+        // `()` in, `DroppedZst` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| ()).bench_values(|_: ()| -> DroppedZst {
+                f();
+                DroppedZst
+            })
+        });
+
+        // `i32` in, `()` out.
+        test(|b, f| b.with_inputs(|| 100i32).bench_values(|_: i32| -> () { f() }));
+
+        // `i32` in, `i32` out.
+        test(|b, f| {
+            b.with_inputs(|| 100i32).bench_values(|value: i32| -> i32 {
+                f();
+                value
+            })
+        });
+
+        // `i32` in, `String` out.
+        test(|b, f| {
+            b.with_inputs(|| 100i32).bench_values(|_: i32| -> String {
+                f();
+                make_string()
+            })
+        });
+
+        // `i32` in, `DroppedZst` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| 100i32).bench_values(|_: i32| -> DroppedZst {
+                f();
+                DroppedZst
+            })
+        });
+
+        // `String` in, `()` out.
+        test(|b, f| b.with_inputs(make_string).bench_values(|_: String| -> () { f() }));
+
+        // `String` in, `i32` out.
+        test(|b, f| {
+            b.with_inputs(make_string).bench_values(|_: String| -> i32 {
+                f();
+                100i32
+            })
+        });
+
+        // `String` in, `String` out.
+        test(|b, f| {
+            b.with_inputs(make_string).bench_values(|value: String| -> String {
+                f();
+                value
+            })
+        });
+
+        // `String` in, `DroppedZst` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(make_string).bench_values(|_: String| -> DroppedZst {
+                f();
+                DroppedZst
+            })
+        });
+
+        // `DroppedZst` in, `()` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| DroppedZst).bench_values(|_: DroppedZst| -> () { f() })
+        });
+
+        // `DroppedZst` in, `i32` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| DroppedZst).bench_values(|_: DroppedZst| -> i32 {
+                f();
+                100i32
+            })
+        });
+
+        // `DroppedZst` in, `String` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| DroppedZst).bench_values(|_: DroppedZst| -> String {
+                f();
+                make_string()
+            })
+        });
+
+        // `DroppedZst` in, `DroppedZst` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| DroppedZst).bench_values(|value: DroppedZst| -> DroppedZst {
+                f();
+                value
+            })
+        });
+    }
+
+    #[test]
+    fn bench_refs() {
+        struct DroppedZst;
+
+        static ZST_DROP_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+        impl Drop for DroppedZst {
+            fn drop(&mut self) {
+                ZST_DROP_COUNT.fetch_add(1, SeqCst);
+            }
+        }
+
+        let test_zst_drop = |run_bench| {
+            ZST_DROP_COUNT.store(0, SeqCst);
+            test_with_drop_counter(&ZST_DROP_COUNT, run_bench);
+        };
+
+        // `&mut ()` in, `()` out.
+        test(|b, f| b.with_inputs(|| ()).bench_refs(|_: &mut ()| -> () { f() }));
+
+        // `&mut ()` in, `i32` out.
+        test(|b, f| {
+            b.with_inputs(|| ()).bench_refs(|_: &mut ()| -> i32 {
+                f();
+                100i32
+            })
+        });
+
+        // `&mut ()` in, `String` out.
+        test(|b, f| {
+            b.with_inputs(|| ()).bench_refs(|_: &mut ()| -> String {
+                f();
+                make_string()
+            })
+        });
+
+        // `&mut ()` in, `DroppedZst` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| ()).bench_refs(|_: &mut ()| -> DroppedZst {
+                f();
+                DroppedZst
+            })
+        });
+
+        // `&mut i32` in, `()` out.
+        test(|b, f| b.with_inputs(|| 100i32).bench_refs(|_: &mut i32| -> () { f() }));
+
+        // `&mut i32` in, `i32` out.
+        test(|b, f| {
+            b.with_inputs(|| 100i32).bench_refs(|value: &mut i32| -> i32 {
+                f();
+                *value
+            })
+        });
+
+        // `&mut i32` in, `String` out.
+        test(|b, f| {
+            b.with_inputs(|| 100i32).bench_refs(|_: &mut i32| -> String {
+                f();
+                make_string()
+            })
+        });
+
+        // `&mut i32` in, `DroppedZst` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| 100i32).bench_refs(|_: &mut i32| -> DroppedZst {
+                f();
+                DroppedZst
+            })
+        });
+
+        // `&mut String` in, `()` out.
+        test(|b, f| b.with_inputs(make_string).bench_refs(|_: &mut String| -> () { f() }));
+
+        // `&mut String` in, `i32` out.
+        test(|b, f| {
+            b.with_inputs(make_string).bench_refs(|_: &mut String| -> i32 {
+                f();
+                100i32
+            })
+        });
+
+        // `&mut String` in, `String` out.
+        test(|b, f| {
+            b.with_inputs(make_string).bench_refs(|value: &mut String| -> String {
+                f();
+                value.clone()
+            })
+        });
+
+        // `&mut String` in, `DroppedZst` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(make_string).bench_refs(|_: &mut String| -> DroppedZst {
+                f();
+                DroppedZst
+            })
+        });
+
+        // `&mut DroppedZst` in, `()` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| DroppedZst).bench_refs(|_: &mut DroppedZst| -> () { f() })
+        });
+
+        // `&mut DroppedZst` in, `i32` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| DroppedZst).bench_refs(|_: &mut DroppedZst| -> i32 {
+                f();
+                100i32
+            })
+        });
+
+        // `&mut DroppedZst` in, `String` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| DroppedZst).bench_refs(|_: &mut DroppedZst| -> String {
+                f();
+                make_string()
+            })
+        });
+
+        // `&mut DroppedZst` in, `DroppedZst` out.
+        test_zst_drop(|b, f| {
+            b.with_inputs(|| {
+                // Adjust counter for input ZST.
+                ZST_DROP_COUNT.fetch_sub(1, SeqCst);
+
+                DroppedZst
+            })
+            .bench_refs(|_: &mut DroppedZst| -> DroppedZst {
+                f();
+                DroppedZst
+            })
+        });
+    }
+}
+
+mod no_input {
+    use super::*;
+
+    #[test]
+    fn string_output() {
+        test_bencher(&mut |b| b.bench(make_string));
+    }
+
+    #[test]
+    fn no_output() {
+        test_bencher(&mut |b| b.bench(|| black_box_drop(make_string())));
+    }
+}
+
+mod string_input {
+    use super::*;
+
+    #[test]
+    fn string_output() {
+        test_bencher(&mut |b| b.with_inputs(make_string).bench_values(|s| s.to_ascii_uppercase()));
+    }
+
+    #[test]
+    fn no_output() {
+        test_bencher(&mut |b| b.with_inputs(make_string).bench_refs(|s| s.make_ascii_uppercase()));
+    }
+}
+
+mod zst_input {
+    use super::*;
+
+    #[test]
+    fn zst_output() {
+        struct DroppedZst;
+
+        // Each test has its own `ZST_COUNT` global because tests are run
+        // independently in parallel.
+        static ZST_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+        impl Drop for DroppedZst {
+            fn drop(&mut self) {
+                ZST_COUNT.fetch_sub(1, SeqCst);
+            }
+        }
+
+        test_bencher(&mut |b| {
+            b.with_inputs(|| {
+                ZST_COUNT.fetch_add(1, SeqCst);
+                DroppedZst
+            })
+            .bench_values(black_box);
+        });
+
+        assert_eq!(ZST_COUNT.load(SeqCst), 0);
+    }
+
+    #[test]
+    fn no_output() {
+        struct DroppedZst;
+
+        static ZST_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+        impl Drop for DroppedZst {
+            fn drop(&mut self) {
+                ZST_COUNT.fetch_sub(1, SeqCst);
+            }
+        }
+
+        test_bencher(&mut |b| {
+            b.with_inputs(|| {
+                ZST_COUNT.fetch_add(1, SeqCst);
+                DroppedZst
+            })
+            .bench_values(drop);
+        });
+
+        assert_eq!(ZST_COUNT.load(SeqCst), 0);
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/cli.rs b/crates/divan_compat/divan_fork/src/cli.rs
new file mode 100644
index 00000000..263e6eaf
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/cli.rs
@@ -0,0 +1,197 @@
+use clap::{builder::PossibleValue, value_parser, Arg, ArgAction, ColorChoice, Command, ValueEnum};
+
+use crate::{
+    config::{ParsedSeconds, SortingAttr},
+    counter::MaxCountUInt,
+    time::TimerKind,
+};
+
+pub(crate) fn command() -> Command {
+    fn option(name: &'static str) -> Arg {
+        Arg::new(name).long(name)
+    }
+
+    fn flag(name: &'static str) -> Arg {
+        option(name).action(ArgAction::SetTrue)
+    }
+
+    fn ignored_flag(name: &'static str) -> Arg {
+        flag(name).hide(true)
+    }
+
+    // Custom arguments not supported by libtest:
+    // - bytes-format
+    // - sample-count
+    // - sample-size
+    // - timer
+    // - sort
+    // - sortr
+
+    // TODO: `--format <pretty|terse>`
+
+    Command::new("divan")
+        .arg(
+            Arg::new("filter")
+                .value_name("FILTER")
+                .help("Only run benchmarks whose names match this pattern")
+                .action(ArgAction::Append),
+        )
+        .arg(
+            flag("test")
+                .help("Run benchmarks once to ensure they run successfully")
+                .conflicts_with("list"),
+        )
+        .arg(flag("list").help("Lists benchmarks").conflicts_with("test"))
+        .arg(
+            option("color")
+                .value_name("WHEN")
+                .help("Controls when to use colors")
+                .value_parser(value_parser!(ColorChoice))
+        )
+        .arg(
+            option("skip")
+                .value_name("FILTER")
+                .help("Skip benchmarks whose names match this pattern")
+                .action(ArgAction::Append),
+        )
+        .arg(flag("exact").help("Filter benchmarks by exact name rather than by pattern"))
+        .arg(flag("ignored").help("Run only ignored benchmarks").conflicts_with("include-ignored"))
+        .arg(
+            flag("include-ignored")
+                .help("Run ignored and not-ignored benchmarks")
+                .conflicts_with("ignored"),
+        )
+        .arg(
+            option("sort")
+                .env("DIVAN_SORT")
+                .value_name("ATTRIBUTE")
+                .help("Sort benchmarks in ascending order")
+                .value_parser(value_parser!(SortingAttr))
+        )
+        .arg(
+            option("sortr")
+                .env("DIVAN_SORTR")
+                .value_name("ATTRIBUTE")
+                .help("Sort benchmarks in descending order")
+                .value_parser(value_parser!(SortingAttr))
+                .overrides_with("sort"),
+        )
+        .arg(
+            option("timer")
+                .env("DIVAN_TIMER")
+                .value_name("os|tsc")
+                .help("Set the timer used for measuring samples")
+                .value_parser(value_parser!(TimerKind)),
+        )
+        .arg(
+            option("sample-count")
+                .env("DIVAN_SAMPLE_COUNT")
+                .value_name("N")
+                .help("Set the number of sampling iterations")
+                .value_parser(value_parser!(u32)),
+        )
+        .arg(
+            option("sample-size")
+                .env("DIVAN_SAMPLE_SIZE")
+                .value_name("N")
+                .help("Set the number of iterations inside a single sample")
+                .value_parser(value_parser!(u32)),
+        )
+        .arg(
+            option("threads")
+                .env("DIVAN_THREADS")
+                .value_name("N")
+                .value_delimiter(',')
+                .action(ArgAction::Append)
+                .help("Run across multiple threads to measure contention on atomics and locks")
+                .value_parser(value_parser!(usize)),
+        )
+        .arg(
+            option("min-time")
+                .env("DIVAN_MIN_TIME")
+                .value_name("SECS")
+                .help("Set the minimum seconds spent benchmarking a single function")
+                .value_parser(value_parser!(ParsedSeconds)),
+        )
+        .arg(
+            option("max-time")
+                .env("DIVAN_MAX_TIME")
+                .value_name("SECS")
+                .help("Set the maximum seconds spent benchmarking a single function, with priority over '--min-time'")
+                .value_parser(value_parser!(ParsedSeconds)),
+        )
+        .arg(
+            option("skip-ext-time")
+                .env("DIVAN_SKIP_EXT_TIME")
+                .value_name("true|false")
+                .help("When '--min-time' or '--max-time' is set, skip time external to benchmarked functions")
+                .value_parser(value_parser!(bool))
+                .num_args(0..=1),
+        )
+        .arg(
+            option("items-count")
+                .env("DIVAN_ITEMS_COUNT")
+                .value_name("N")
+                .help("Set every benchmark to have a throughput of N items")
+                .value_parser(value_parser!(MaxCountUInt)),
+        )
+        .arg(
+            option("bytes-count")
+                .env("DIVAN_BYTES_COUNT")
+                .value_name("N")
+                .help("Set every benchmark to have a throughput of N bytes")
+                .value_parser(value_parser!(MaxCountUInt)),
+        )
+        .arg(
+            option("bytes-format")
+                .env("DIVAN_BYTES_FORMAT")
+                .help("Set the numerical base for bytes in output")
+                .value_name("decimal|binary")
+                .value_parser(value_parser!(crate::counter::PrivBytesFormat))
+        )
+        .arg(
+            option("chars-count")
+                .env("DIVAN_CHARS_COUNT")
+                .value_name("N")
+                .help("Set every benchmark to have a throughput of N string scalars")
+                .value_parser(value_parser!(MaxCountUInt)),
+        )
+        .arg(
+            option("cycles-count")
+                .env("DIVAN_CYCLES_COUNT")
+                .value_name("N")
+                .help("Set every benchmark to have a throughput of N cycles, displayed as Hertz")
+                .value_parser(value_parser!(MaxCountUInt)),
+        )
+        // ignored:
+        .args([ignored_flag("bench"), ignored_flag("nocapture"), ignored_flag("show-output")])
+}
+
+impl ValueEnum for TimerKind {
+    fn value_variants<'a>() -> &'a [Self] {
+        &[Self::Os, Self::Tsc]
+    }
+
+    fn to_possible_value(&self) -> Option<PossibleValue> {
+        let name = match self {
+            Self::Os => "os",
+            Self::Tsc => "tsc",
+        };
+        Some(PossibleValue::new(name))
+    }
+}
+
+impl ValueEnum for SortingAttr {
+    fn value_variants<'a>() -> &'a [Self] {
+        &[Self::Kind, Self::Name, Self::Location]
+    }
+
+    fn to_possible_value(&self) -> Option<PossibleValue> {
+        let name = match self {
+            Self::Kind => "kind",
+            Self::Name => "name",
+            Self::Location => "location",
+        };
+        Some(PossibleValue::new(name))
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/compile_fail.rs b/crates/divan_compat/divan_fork/src/compile_fail.rs
new file mode 100644
index 00000000..8a86b7fe
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/compile_fail.rs
@@ -0,0 +1,36 @@
+//! Private compile failure tests.
+//!
+//! # Repeated Options
+//!
+//! Options repeated in `#[divan::bench]` should cause a compile error, even if
+//! they use raw identifiers. The initial implementation allowed raw identifiers
+//! to slip through because `syn::Ident` does not consider them to be equal to
+//! the normal form without the `r#` prefix.
+//!
+//! We don't include `r#crate` here because it's not a valid identifier.
+//!
+//! ```compile_fail
+//! #[divan::bench(name = "x", r#name = "x")]
+//! fn bench() {}
+//! ```
+//!
+//! ```compile_fail
+//! #[divan::bench(sample_count = 1, r#sample_count = 1)]
+//! fn bench() {}
+//! ```
+//!
+//! ```compile_fail
+//! #[divan::bench(sample_size = 1, r#sample_size = 1)]
+//! fn bench() {}
+//! ```
+//!
+//! # Type Checking
+//!
+//! The following won't produce any benchmarks because `types = []`. However, we
+//! still want to ensure that values in `consts = [...]` match the generic
+//! const's type of `i32`.
+//!
+//! ```compile_fail
+//! #[divan::bench(types = [], consts = ['a', 'b', 'c'])]
+//! fn bench<T, const C: i32>() {}
+//! ```
diff --git a/crates/divan_compat/divan_fork/src/config.rs b/crates/divan_compat/divan_fork/src/config.rs
new file mode 100644
index 00000000..1c0daae0
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/config.rs
@@ -0,0 +1,186 @@
+use std::{cmp::Ordering, error::Error, str::FromStr, time::Duration};
+
+use regex::Regex;
+
+use crate::util::sort::natural_cmp;
+
+/// `Duration` wrapper for parsing seconds from the CLI.
+#[derive(Clone, Copy)]
+pub(crate) struct ParsedSeconds(pub Duration);
+
+impl FromStr for ParsedSeconds {
+    type Err = Box<dyn Error + Send + Sync>;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(Self(Duration::try_from_secs_f64(f64::from_str(s)?)?))
+    }
+}
+
+/// The primary action to perform.
+#[derive(Clone, Copy, Default)]
+pub(crate) enum Action {
+    /// Run benchmark loops.
+    #[default]
+    Bench,
+
+    /// Run benchmarked functions once to ensure they run successfully.
+    Test,
+
+    /// List benchmarks.
+    List,
+}
+
+#[allow(dead_code)]
+impl Action {
+    #[inline]
+    pub fn is_bench(&self) -> bool {
+        matches!(self, Self::Bench)
+    }
+
+    #[inline]
+    pub fn is_test(&self) -> bool {
+        matches!(self, Self::Test)
+    }
+
+    #[inline]
+    pub fn is_list(&self) -> bool {
+        matches!(self, Self::List)
+    }
+}
+
+/// Filters which benchmark to run based on name.
+pub(crate) enum Filter {
+    Regex(Regex),
+    Exact(String),
+}
+
+impl Filter {
+    /// Returns `true` if a string matches this filter.
+    pub fn is_match(&self, s: &str) -> bool {
+        match self {
+            Self::Regex(r) => r.is_match(s),
+            Self::Exact(e) => e == s,
+        }
+    }
+}
+
+/// How to treat benchmarks based on whether they're marked as `#[ignore]`.
+#[derive(Copy, Clone, Default)]
+pub(crate) enum RunIgnored {
+    /// Skip ignored.
+    #[default]
+    No,
+
+    /// `--include-ignored`.
+    Yes,
+
+    /// `--ignored`.
+    Only,
+}
+
+impl RunIgnored {
+    pub fn run_ignored(self) -> bool {
+        matches!(self, Self::Yes | Self::Only)
+    }
+
+    pub fn run_non_ignored(self) -> bool {
+        matches!(self, Self::Yes | Self::No)
+    }
+
+    pub fn should_run(self, ignored: bool) -> bool {
+        if ignored {
+            self.run_ignored()
+        } else {
+            self.run_non_ignored()
+        }
+    }
+}
+
+/// The attribute to sort benchmarks by.
+#[derive(Clone, Copy, Default)]
+pub(crate) enum SortingAttr {
+    /// Sort by kind, then by name and location.
+    #[default]
+    Kind,
+
+    /// Sort by name, then by location and kind.
+    Name,
+
+    /// Sort by location, then by kind and name.
+    Location,
+}
+
+impl SortingAttr {
+    /// Returns an array containing `self` along with other attributes that
+    /// should break ties if attributes are equal.
+    pub fn with_tie_breakers(self) -> [Self; 3] {
+        use SortingAttr::*;
+
+        match self {
+            Kind => [self, Name, Location],
+            Name => [self, Location, Kind],
+            Location => [self, Kind, Name],
+        }
+    }
+
+    /// Compares benchmark runtime argument names by `self`, then by its tie breakers.
+    ///
+    /// This takes `&&str` to handle `SortingAttr::Location` since the strings
+    /// are considered to be within the same `&[&str]`.
+    pub fn cmp_bench_arg_names(self, a: &&str, b: &&str) -> Ordering {
+        for attr in self.with_tie_breakers() {
+            let ordering = match attr {
+                SortingAttr::Kind => Ordering::Equal,
+
+                SortingAttr::Name => 'ordering: {
+                    // Compare as integers: unsigned first, then signed for negative values.
+                    match (a.parse::<u128>(), b.parse::<u128>()) {
+                        (Ok(a_u128), Ok(b_u128)) => break 'ordering a_u128.cmp(&b_u128),
+
+                        (Ok(_), Err(_)) => {
+                            if b.parse::<i128>().is_ok() {
+                                // a > b, because b is negative.
+                                break 'ordering Ordering::Greater;
+                            }
+                        }
+
+                        (Err(_), Ok(_)) => {
+                            if a.parse::<i128>().is_ok() {
+                                // a < b, because a is negative.
+                                break 'ordering Ordering::Less;
+                            }
+                        }
+
+                        (Err(_), Err(_)) => {
+                            if let (Ok(a_i128), Ok(b_i128)) = (a.parse::<i128>(), b.parse::<i128>())
+                            {
+                                break 'ordering a_i128.cmp(&b_i128);
+                            }
+                        }
+                    }
+
+                    // Compare as floats; incomparable values (NaN) fall through.
+                    if let (Ok(a), Ok(b)) = (a.parse::<f64>(), b.parse::<f64>()) {
+                        if let Some(ordering) = a.partial_cmp(&b) {
+                            break 'ordering ordering;
+                        }
+                    }
+
+                    natural_cmp(a, b)
+                }
+
+                SortingAttr::Location => {
+                    let a: *const &str = a;
+                    let b: *const &str = b;
+                    a.cmp(&b)
+                }
+            };
+
+            if ordering != Ordering::Equal {
+                return ordering;
+            }
+        }
+
+        Ordering::Equal
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/counter/any_counter.rs b/crates/divan_compat/divan_fork/src/counter/any_counter.rs
new file mode 100644
index 00000000..57f60dce
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/counter/any_counter.rs
@@ -0,0 +1,233 @@
+use std::any::TypeId;
+
+use crate::{
+    counter::{
+        BytesCount, BytesFormat, CharsCount, CyclesCount, IntoCounter, ItemsCount, MaxCountUInt,
+    },
+    time::FineDuration,
+    util::{fmt::DisplayThroughput, ty::TypeCast},
+};
+
+/// Type-erased `Counter`.
+///
+/// This does not implement `Copy` because in the future it will contain
+/// user-defined counters.
+#[derive(Clone)]
+pub(crate) struct AnyCounter {
+    pub kind: KnownCounterKind,
+    count: MaxCountUInt,
+}
+
+impl AnyCounter {
+    #[inline]
+    pub(crate) fn new<C: IntoCounter>(counter: C) -> Self {
+        let counter = counter.into_counter();
+
+        if let Some(bytes) = counter.cast_ref::<BytesCount>() {
+            Self::bytes(bytes.count)
+        } else if let Some(chars) = counter.cast_ref::<CharsCount>() {
+            Self::chars(chars.count)
+        } else if let Some(cycles) = counter.cast_ref::<CyclesCount>() {
+            Self::cycles(cycles.count)
+        } else if let Some(items) = counter.cast_ref::<ItemsCount>() {
+            Self::items(items.count)
+        } else {
+            unreachable!()
+        }
+    }
+
+    #[inline]
+    pub(crate) fn known(kind: KnownCounterKind, count: MaxCountUInt) -> Self {
+        Self { kind, count }
+    }
+
+    #[inline]
+    pub(crate) fn bytes(count: MaxCountUInt) -> Self {
+        Self::known(KnownCounterKind::Bytes, count)
+    }
+
+    #[inline]
+    pub(crate) fn chars(count: MaxCountUInt) -> Self {
+        Self::known(KnownCounterKind::Chars, count)
+    }
+
+    #[inline]
+    pub(crate) fn cycles(count: MaxCountUInt) -> Self {
+        Self::known(KnownCounterKind::Cycles, count)
+    }
+
+    #[inline]
+    pub(crate) fn items(count: MaxCountUInt) -> Self {
+        Self::known(KnownCounterKind::Items, count)
+    }
+
+    pub(crate) fn display_throughput(
+        &self,
+        duration: FineDuration,
+        bytes_format: BytesFormat,
+    ) -> DisplayThroughput {
+        DisplayThroughput { counter: self, picos: duration.picos as f64, bytes_format }
+    }
+
+    #[inline]
+    pub(crate) fn count(&self) -> MaxCountUInt {
+        self.count
+    }
+
+    #[inline]
+    pub(crate) fn known_kind(&self) -> KnownCounterKind {
+        self.kind
+    }
+}
+
+/// Kind of `Counter` defined by this crate.
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub(crate) enum KnownCounterKind {
+    Bytes,
+    Chars,
+    Cycles,
+    Items,
+}
+
+impl KnownCounterKind {
+    pub const COUNT: usize = 4;
+
+    pub const ALL: [Self; Self::COUNT] = [Self::Bytes, Self::Chars, Self::Cycles, Self::Items];
+
+    /// The maximum width for columns displaying counters.
+    pub const MAX_COMMON_COLUMN_WIDTH: usize = "1.111 Kitem/s".len();
+
+    #[inline]
+    pub fn of<C: IntoCounter>() -> Self {
+        let id = TypeId::of::<C::Counter>();
+        if id == TypeId::of::<BytesCount>() {
+            Self::Bytes
+        } else if id == TypeId::of::<CharsCount>() {
+            Self::Chars
+        } else if id == TypeId::of::<CyclesCount>() {
+            Self::Cycles
+        } else if id == TypeId::of::<ItemsCount>() {
+            Self::Items
+        } else {
+            unreachable!()
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn known_counter_kind() {
+        macro_rules! test {
+            ($t:ident, $k:ident) => {
+                assert_eq!(KnownCounterKind::of::<$t>(), KnownCounterKind::$k);
+            };
+        }
+
+        test!(BytesCount, Bytes);
+        test!(CharsCount, Chars);
+        test!(CyclesCount, Cycles);
+        test!(ItemsCount, Items);
+    }
+
+    mod display_throughput {
+        use super::*;
+
+        #[test]
+        fn bytes() {
+            #[track_caller]
+            fn test(
+                bytes: MaxCountUInt,
+                picos: u128,
+                expected_binary: &str,
+                expected_decimal: &str,
+            ) {
+                for (bytes_format, expected) in [
+                    (BytesFormat::Binary, expected_binary),
+                    (BytesFormat::Decimal, expected_decimal),
+                ] {
+                    assert_eq!(
+                        AnyCounter::bytes(bytes)
+                            .display_throughput(FineDuration { picos }, bytes_format)
+                            .to_string(),
+                        expected
+                    );
+                }
+            }
+
+            #[track_caller]
+            fn test_all(bytes: MaxCountUInt, picos: u128, expected: &str) {
+                test(bytes, picos, expected, expected);
+            }
+
+            test_all(1, 0, "inf B/s");
+            test_all(MaxCountUInt::MAX, 0, "inf B/s");
+
+            test_all(0, 0, "0 B/s");
+            test_all(0, 1, "0 B/s");
+            test_all(0, u128::MAX, "0 B/s");
+        }
+
+        #[test]
+        fn chars() {
+            #[track_caller]
+            fn test(chars: MaxCountUInt, picos: u128, expected: &str) {
+                assert_eq!(
+                    AnyCounter::chars(chars)
+                        .display_throughput(FineDuration { picos }, BytesFormat::default())
+                        .to_string(),
+                    expected
+                );
+            }
+
+            test(1, 0, "inf char/s");
+            test(MaxCountUInt::MAX, 0, "inf char/s");
+
+            test(0, 0, "0 char/s");
+            test(0, 1, "0 char/s");
+            test(0, u128::MAX, "0 char/s");
+        }
+
+        #[test]
+        fn cycles() {
+            #[track_caller]
+            fn test(cycles: MaxCountUInt, picos: u128, expected: &str) {
+                assert_eq!(
+                    AnyCounter::cycles(cycles)
+                        .display_throughput(FineDuration { picos }, BytesFormat::default())
+                        .to_string(),
+                    expected
+                );
+            }
+
+            test(1, 0, "inf Hz");
+            test(MaxCountUInt::MAX, 0, "inf Hz");
+
+            test(0, 0, "0 Hz");
+            test(0, 1, "0 Hz");
+            test(0, u128::MAX, "0 Hz");
+        }
+
+        #[test]
+        fn items() {
+            #[track_caller]
+            fn test(items: MaxCountUInt, picos: u128, expected: &str) {
+                assert_eq!(
+                    AnyCounter::items(items)
+                        .display_throughput(FineDuration { picos }, BytesFormat::default())
+                        .to_string(),
+                    expected
+                );
+            }
+
+            test(1, 0, "inf item/s");
+            test(MaxCountUInt::MAX, 0, "inf item/s");
+
+            test(0, 0, "0 item/s");
+            test(0, 1, "0 item/s");
+            test(0, u128::MAX, "0 item/s");
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/counter/collection.rs b/crates/divan_compat/divan_fork/src/counter/collection.rs
new file mode 100644
index 00000000..73dcd4bc
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/counter/collection.rs
@@ -0,0 +1,146 @@
+use crate::counter::{AnyCounter, IntoCounter, KnownCounterKind, MaxCountUInt};
+
+/// Multi-map from counters to their counts and input-based initializer.
+#[derive(Default)]
+pub(crate) struct CounterCollection {
+    info: [KnownCounterInfo; KnownCounterKind::COUNT],
+}
+
+#[derive(Default)]
+struct KnownCounterInfo {
+    // TODO: Inlinable vector.
+    counts: Vec<MaxCountUInt>,
+
+    /// `BencherConfig::with_inputs` can only be called once, so the input type
+    /// cannot change.
+    count_input: Option<Box</* unsafe */ dyn Fn(*const ()) -> MaxCountUInt + Sync>>,
+}
+
+impl CounterCollection {
+    #[inline]
+    fn info(&self, counter_kind: KnownCounterKind) -> &KnownCounterInfo {
+        &self.info[counter_kind as usize]
+    }
+
+    #[inline]
+    fn info_mut(&mut self, counter_kind: KnownCounterKind) -> &mut KnownCounterInfo {
+        &mut self.info[counter_kind as usize]
+    }
+
+    #[inline]
+    pub(crate) fn counts(&self, counter_kind: KnownCounterKind) -> &[MaxCountUInt] {
+        &self.info(counter_kind).counts
+    }
+
+    /// Returns the mean of the counts recorded for `counter_kind`, or 0 if there are none.
+    pub(crate) fn mean_count(&self, counter_kind: KnownCounterKind) -> MaxCountUInt {
+        let counts = self.counts(counter_kind);
+        let sum: u128 = counts.iter().map(|&c| c as u128).sum();
+        // `checked_div` yields `None` for an empty slice, where the divisor would be 0.
+        sum.checked_div(counts.len() as u128).unwrap_or(0) as MaxCountUInt
+    }
+
+    #[inline]
+    pub(crate) fn uses_input_counts(&self, counter_kind: KnownCounterKind) -> bool {
+        self.info(counter_kind).count_input.is_some()
+    }
+
+    pub(crate) fn set_counter(&mut self, counter: AnyCounter) {
+        let new_count = counter.count();
+        let info = self.info_mut(counter.known_kind());
+
+        if let Some(old_count) = info.counts.first_mut() {
+            *old_count = new_count;
+        } else {
+            info.counts.push(new_count);
+        }
+    }
+
+    pub(crate) fn push_counter(&mut self, counter: AnyCounter) {
+        self.info_mut(counter.known_kind()).counts.push(counter.count());
+    }
+
+    /// Set the input-based count generator function for a counter.
+    pub(crate) fn set_input_counter<I, C, F>(&mut self, make_counter: F)
+    where
+        F: Fn(&I) -> C + Sync + 'static,
+        C: IntoCounter,
+    {
+        let info = self.info_mut(KnownCounterKind::of::<C::Counter>());
+
+        // Ignore previously-set counts.
+        info.counts.clear();
+
+        info.count_input = Some(Box::new(move |input: *const ()| {
+            // SAFETY: Callers to `get_input_count` guarantee that the same `&I`
+            // is passed.
+            let counter = unsafe { make_counter(&*input.cast::<I>()) };
+
+            AnyCounter::new(counter).count()
+        }));
+    }
+
+    /// Calls the user-provided closure to get the counter count for a given
+    /// input.
+    ///
+    /// # Safety
+    ///
+    /// The `I` type must be the same as that used by `set_input_counter`.
+    pub(crate) unsafe fn get_input_count<I>(
+        &self,
+        counter_kind: KnownCounterKind,
+        input: &I,
+    ) -> Option<MaxCountUInt> {
+        let from_input = self.info(counter_kind).count_input.as_ref()?;
+
+        // SAFETY: The caller ensures that this is called on the same input type
+        // used for calling `set_input_counter`.
+        Some(unsafe { from_input(input as *const I as *const ()) })
+    }
+
+    /// Removes counts that came from input.
+    pub(crate) fn clear_input_counts(&mut self) {
+        for info in &mut self.info {
+            if info.count_input.is_some() {
+                info.counts.clear();
+            }
+        }
+    }
+}
+
+/// A set of known and (future) custom counters.
+#[derive(Clone, Debug, Default)]
+pub struct CounterSet {
+    counts: [Option<MaxCountUInt>; KnownCounterKind::COUNT],
+}
+
+impl CounterSet {
+    pub fn with(mut self, counter: impl IntoCounter) -> Self {
+        self.insert(counter);
+        self
+    }
+
+    pub fn insert(&mut self, counter: impl IntoCounter) -> &mut Self {
+        let counter = AnyCounter::new(counter);
+        self.counts[counter.known_kind() as usize] = Some(counter.count());
+        self
+    }
+
+    pub(crate) fn get(&self, counter_kind: KnownCounterKind) -> Option<MaxCountUInt> {
+        self.counts[counter_kind as usize]
+    }
+
+    /// Returns a new set that takes each count from `self`, falling back to `other` when unset.
+    pub(crate) fn overwrite(&self, other: &Self) -> Self {
+        Self { counts: KnownCounterKind::ALL.map(|kind| self.get(kind).or(other.get(kind))) }
+    }
+
+    pub(crate) fn to_collection(&self) -> CounterCollection {
+        CounterCollection {
+            info: KnownCounterKind::ALL.map(|kind| KnownCounterInfo {
+                counts: self.get(kind).into_iter().collect(),
+                count_input: None,
+            }),
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/counter/into_counter.rs b/crates/divan_compat/divan_fork/src/counter/into_counter.rs
new file mode 100644
index 00000000..45a09da9
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/counter/into_counter.rs
@@ -0,0 +1,38 @@
+use crate::counter::Counter;
+
+/// Conversion into a [`Counter`].
+///
+/// # Examples
+///
+/// This trait is implemented for unsigned integers over
+/// [`ItemsCount`](crate::counter::ItemsCount):
+///
+/// ```
+/// #[divan::bench]
+/// fn sort_values(bencher: divan::Bencher) {
+///     # type T = String;
+///     let mut values: Vec<T> = // ...
+///     # Vec::new();
+///     bencher
+///         .counter(values.len())
+///         .bench_local(|| {
+///             divan::black_box(&mut values).sort();
+///         });
+/// }
+/// ```
+pub trait IntoCounter {
+    /// Which kind of counter are we turning this into?
+    type Counter: Counter;
+
+    /// Converts into a [`Counter`].
+    fn into_counter(self) -> Self::Counter;
+}
+
+impl<C: Counter> IntoCounter for C {
+    type Counter = C;
+
+    #[inline]
+    fn into_counter(self) -> Self::Counter {
+        self
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/counter/mod.rs b/crates/divan_compat/divan_fork/src/counter/mod.rs
new file mode 100644
index 00000000..900c9e27
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/counter/mod.rs
@@ -0,0 +1,303 @@
+//! Count values processed in each iteration to measure throughput.
+//!
+//! # Examples
+//!
+//! The following example measures throughput of converting
+//! [`&[i32]`](prim@slice) into [`Vec<i32>`](Vec) by providing [`BytesCount`]
+//! via [`Bencher::counter`](crate::Bencher::counter):
+//!
+//! ```
+//! use divan::counter::BytesCount;
+//!
+//! #[divan::bench]
+//! fn slice_into_vec(bencher: divan::Bencher) {
+//!     let ints: &[i32] = &[
+//!         // ...
+//!     ];
+//!
+//!     let bytes = BytesCount::of_slice(ints);
+//!
+//!     bencher
+//!         .counter(bytes)
+//!         .bench(|| -> Vec<i32> {
+//!             divan::black_box(ints).into()
+//!         });
+//! }
+//! ```
+
+use std::any::Any;
+
+mod any_counter;
+mod collection;
+mod into_counter;
+mod sealed;
+mod uint;
+
+pub(crate) use self::{
+    any_counter::{AnyCounter, KnownCounterKind},
+    collection::{CounterCollection, CounterSet},
+    sealed::Sealed,
+    uint::{AsCountUInt, CountUInt, MaxCountUInt},
+};
+pub use into_counter::IntoCounter;
+
+/// Counts the number of values processed in each iteration of a benchmarked
+/// function.
+///
+/// This is used via:
+/// - [`#[divan::bench(counters = ...)]`](macro@crate::bench#counters)
+/// - [`#[divan::bench_group(counters = ...)]`](macro@crate::bench_group#counters)
+/// - [`Bencher::counter`](crate::Bencher::counter)
+/// - [`Bencher::input_counter`](crate::Bencher::input_counter)
+#[doc(alias = "throughput")]
+pub trait Counter: Sized + Any + Sealed {}
+
+/// Process N bytes.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct BytesCount {
+    count: MaxCountUInt,
+}
+
+/// Process N [`char`s](char).
+///
+/// This is beneficial when comparing benchmarks between ASCII and Unicode
+/// implementations, since the number of code points is a common baseline
+/// reference.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct CharsCount {
+    count: MaxCountUInt,
+}
+
+/// Process N cycles, displayed as Hertz.
+///
+/// This value is user-provided and does not necessarily correspond to the CPU's
+/// cycle frequency, so it may represent cycles of anything appropriate for the
+/// benchmarking context.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct CyclesCount {
+    count: MaxCountUInt,
+}
+
+/// Process N items.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct ItemsCount {
+    count: MaxCountUInt,
+}
+
+impl Sealed for BytesCount {}
+impl Sealed for CharsCount {}
+impl Sealed for CyclesCount {}
+impl Sealed for ItemsCount {}
+
+impl Counter for BytesCount {}
+impl Counter for CharsCount {}
+impl Counter for CyclesCount {}
+impl Counter for ItemsCount {}
+
+impl<C: AsCountUInt> From<C> for BytesCount {
+    #[inline]
+    fn from(count: C) -> Self {
+        Self::new(count.as_max_uint())
+    }
+}
+
+impl<C: AsCountUInt> From<C> for CharsCount {
+    #[inline]
+    fn from(count: C) -> Self {
+        Self::new(count.as_max_uint())
+    }
+}
+
+impl<C: AsCountUInt> From<C> for CyclesCount {
+    #[inline]
+    fn from(count: C) -> Self {
+        Self::new(count.as_max_uint())
+    }
+}
+
+impl<C: AsCountUInt> From<C> for ItemsCount {
+    #[inline]
+    fn from(count: C) -> Self {
+        Self::new(count.as_max_uint())
+    }
+}
+
+impl BytesCount {
+    /// Count N bytes.
+    #[inline]
+    pub fn new<N: CountUInt>(count: N) -> Self {
+        Self { count: count.into_max_uint() }
+    }
+
+    /// Counts the size of a type with [`size_of`].
+    #[inline]
+    #[doc(alias = "size_of")]
+    pub const fn of<T>() -> Self {
+        Self { count: size_of::<T>() as MaxCountUInt }
+    }
+
+    /// Counts the size of multiple instances of a type with [`size_of`].
+    #[inline]
+    #[doc(alias = "size_of")]
+    pub const fn of_many<T>(n: usize) -> Self {
+        match (size_of::<T>() as MaxCountUInt).checked_mul(n as MaxCountUInt) {
+            Some(count) => Self { count },
+            None => panic!("overflow"),
+        }
+    }
+
+    /// Counts the size of a value with [`size_of_val`].
+    #[inline]
+    #[doc(alias = "size_of_val")]
+    pub fn of_val<T: ?Sized>(val: &T) -> Self {
+        // TODO: Make const, https://github.com/rust-lang/rust/issues/46571
+        Self { count: size_of_val(val) as MaxCountUInt }
+    }
+
+    /// Counts the bytes of [`Iterator::Item`s](Iterator::Item).
+    #[inline]
+    pub fn of_iter<T, I>(iter: I) -> Self
+    where
+        I: IntoIterator<Item = T>,
+    {
+        Self::of_many::<T>(iter.into_iter().count())
+    }
+
+    /// Counts the bytes of a [`&str`].
+    ///
+    /// This is like [`BytesCount::of_val`] with the convenience of behaving as
+    /// expected for [`&String`](String) and other types that convert to
+    /// [`&str`].
+    ///
+    /// [`&str`]: prim@str
+    #[inline]
+    pub fn of_str<S: ?Sized + AsRef<str>>(s: &S) -> Self {
+        Self::of_val(s.as_ref())
+    }
+
+    /// Counts the bytes of a [slice](prim@slice).
+    ///
+    /// This is like [`BytesCount::of_val`] with the convenience of behaving as
+    /// expected for [`&Vec<T>`](Vec) and other types that convert to
+    /// [`&[T]`](prim@slice).
+    #[inline]
+    pub fn of_slice<T, S: ?Sized + AsRef<[T]>>(s: &S) -> Self {
+        Self::of_val(s.as_ref())
+    }
+}
+
+macro_rules! type_bytes {
+    ($ty:ident) => {
+        /// Counts the bytes of multiple
+        #[doc = concat!("[`", stringify!($ty), "`s](", stringify!($ty), ").")]
+        #[inline]
+        pub const fn $ty(n: usize) -> Self {
+            Self::of_many::<$ty>(n)
+        }
+    };
+}
+
+/// Count bytes of multiple values.
+impl BytesCount {
+    type_bytes!(f32);
+    type_bytes!(f64);
+
+    type_bytes!(i8);
+    type_bytes!(u8);
+    type_bytes!(i16);
+    type_bytes!(u16);
+    type_bytes!(i32);
+    type_bytes!(u32);
+    type_bytes!(i64);
+    type_bytes!(u64);
+    type_bytes!(i128);
+    type_bytes!(u128);
+    type_bytes!(isize);
+    type_bytes!(usize);
+}
+
+impl CharsCount {
+    /// Count N [`char`s](char).
+    #[inline]
+    pub fn new<N: CountUInt>(count: N) -> Self {
+        Self { count: count.into_max_uint() }
+    }
+
+    /// Counts the [`char`s](prim@char) of a [`&str`](prim@str).
+    #[inline]
+    pub fn of_str<S: ?Sized + AsRef<str>>(s: &S) -> Self {
+        Self::new(s.as_ref().chars().count())
+    }
+}
+
+impl CyclesCount {
+    /// Count N cycles.
+    #[inline]
+    pub fn new<N: CountUInt>(count: N) -> Self {
+        Self { count: count.into_max_uint() }
+    }
+}
+
+impl ItemsCount {
+    /// Count N items.
+    #[inline]
+    pub fn new<N: CountUInt>(count: N) -> Self {
+        Self { count: count.into_max_uint() }
+    }
+
+    /// Counts [`Iterator::Item`s](Iterator::Item).
+    #[inline]
+    pub fn of_iter<T, I>(iter: I) -> Self
+    where
+        I: IntoIterator<Item = T>,
+    {
+        Self::new(iter.into_iter().count())
+    }
+}
+
+/// The numerical base for [`BytesCount`] in benchmark outputs.
+///
+/// See [`Divan::bytes_format`](crate::Divan::bytes_format) for more info.
+#[derive(Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
+#[non_exhaustive]
+pub enum BytesFormat {
+    /// Powers of 1000, starting with KB (kilobyte). This is the default.
+    #[default]
+    Decimal,
+
+    /// Powers of 1024, starting with KiB (kibibyte).
+    Binary,
+}
+
+/// Private `BytesFormat` that prevents leaking trait implementations we don't
+/// want to publicly commit to.
+#[derive(Clone, Copy)]
+pub(crate) struct PrivBytesFormat(pub BytesFormat);
+
+impl clap::ValueEnum for PrivBytesFormat {
+    fn value_variants<'a>() -> &'a [Self] {
+        &[Self(BytesFormat::Decimal), Self(BytesFormat::Binary)]
+    }
+
+    fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
+        use clap::builder::PossibleValue;
+        Some(PossibleValue::new(match self.0 {
+            BytesFormat::Decimal => "decimal",
+            BytesFormat::Binary => "binary",
+        }))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    mod bytes_count {
+        use super::*;
+
+        #[test]
+        fn of_iter() {
+            assert_eq!(BytesCount::of_iter::<i32, _>([1, 2, 3]), BytesCount::of_slice(&[1, 2, 3]));
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/counter/sealed.rs b/crates/divan_compat/divan_fork/src/counter/sealed.rs
new file mode 100644
index 00000000..fe4cc31f
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/counter/sealed.rs
@@ -0,0 +1,5 @@
+/// Prevents `Counter` from being implemented externally.
+///
+/// Items exist on this trait rather than `Counter` so that they are impossible
+/// to access externally.
+pub trait Sealed {}
diff --git a/crates/divan_compat/divan_fork/src/counter/uint.rs b/crates/divan_compat/divan_fork/src/counter/uint.rs
new file mode 100644
index 00000000..2c5770d2
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/counter/uint.rs
@@ -0,0 +1,62 @@
+use std::any::Any;
+
+use crate::counter::{IntoCounter, ItemsCount};
+
+/// The largest unsigned integer usable by counters provided by this crate.
+///
+/// If `usize > u64`, this is a type alias to `usize`. Otherwise, it is a type
+/// alias to `u64`.
+pub type MaxCountUInt = condtype::num::Usize64;
+
+/// `u8`-`u64` and `usize`.
+///
+/// We deliberately do not implement this trait for `u128` to make it
+/// impossible† to overflow `u128` when summing counts for averaging.
+///
+/// †When `usize` is larger than `u64`, it becomes possible to overflow `u128`.
+// NOTE(review): footnote was cut off after "In this case, Divan assumes" — TODO complete it.
+pub trait CountUInt: Copy + Any {
+    fn into_max_uint(self) -> MaxCountUInt;
+}
+
+/// Like `CountUInt`, but converts via `&self`, so references to implementors also qualify.
+pub trait AsCountUInt {
+    fn as_max_uint(&self) -> MaxCountUInt;
+}
+
+impl<T: AsCountUInt> AsCountUInt for &T {
+    #[inline]
+    fn as_max_uint(&self) -> MaxCountUInt {
+        T::as_max_uint(self)
+    }
+}
+
+macro_rules! impl_uint {
+    ($($i:ty),+) => {
+        $(impl CountUInt for $i {
+            #[inline]
+            fn into_max_uint(self) -> MaxCountUInt {
+                self as _
+            }
+        })+
+
+        $(impl AsCountUInt for $i {
+            #[inline]
+            fn as_max_uint(&self) -> MaxCountUInt {
+                *self as _
+            }
+        })+
+
+        $(impl IntoCounter for $i {
+            type Counter = ItemsCount;
+
+            #[inline]
+            fn into_counter(self) -> ItemsCount {
+                ItemsCount::new(self)
+            }
+        })+
+    };
+}
+
+// These types must be losslessly convertible to `MaxCountUInt`.
+impl_uint!(u8, u16, u32, u64, usize);
diff --git a/crates/divan_compat/divan_fork/src/divan.rs b/crates/divan_compat/divan_fork/src/divan.rs
new file mode 100644
index 00000000..c6007b20
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/divan.rs
@@ -0,0 +1,768 @@
+#![allow(clippy::too_many_arguments)]
+
+use std::{borrow::Cow, cell::RefCell, fmt, num::NonZeroUsize, time::Duration};
+
+use clap::ColorChoice;
+use regex::Regex;
+
+use crate::{
+    bench::BenchOptions,
+    config::{Action, Filter, ParsedSeconds, RunIgnored, SortingAttr},
+    counter::{
+        BytesCount, BytesFormat, CharsCount, CyclesCount, IntoCounter, ItemsCount, MaxCountUInt,
+        PrivBytesFormat,
+    },
+    entry::{AnyBenchEntry, BenchEntryRunner, EntryTree},
+    thread_pool::BENCH_POOL,
+    time::{Timer, TimerKind},
+    tree_painter::{TreeColumn, TreePainter},
+    util::{self, defer},
+    Bencher,
+};
+
+/// The benchmark runner.
+#[derive(Default)]
+pub struct Divan {
+    action: Action,
+    timer: TimerKind,
+    reverse_sort: bool,
+    sorting_attr: SortingAttr,
+    color: ColorChoice,
+    bytes_format: BytesFormat,
+    filters: Vec<Filter>,
+    skip_filters: Vec<Filter>,
+    run_ignored: RunIgnored,
+    bench_options: BenchOptions<'static>,
+}
+
+/// Immutable context shared between entry runs.
+pub(crate) struct SharedContext {
+    /// The specific action being performed.
+    pub action: Action,
+
+    /// The timer used to measure samples.
+    pub timer: Timer,
+}
+
+impl fmt::Debug for Divan {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Divan").finish_non_exhaustive()
+    }
+}
+
+impl Divan {
+    /// Perform the configured action.
+    ///
+    /// By default, this will be [`Divan::run_benches`].
+    pub fn main(&self) {
+        self.run_action(self.action);
+    }
+
+    /// Benchmark registered functions.
+    pub fn run_benches(&self) {
+        self.run_action(Action::Bench);
+    }
+
+    /// Test registered functions as if the `--test` flag was used.
+    ///
+    /// Unlike [`Divan::run_benches`], this runs each benchmarked function only
+    /// once.
+    pub fn test_benches(&self) {
+        self.run_action(Action::Test);
+    }
+
+    /// Prints registered benchmarks without running them, as if `--list` was used.
+    pub fn list_benches(&self) {
+        self.run_action(Action::List);
+    }
+
+    /// Decides whether the entry at `entry_path` passes the configured
+    /// filters and is therefore a candidate for running.
+    ///
+    /// `entry.ignored` is deliberately not consulted here because that is
+    /// handled separately.
+    fn filter(&self, entry_path: &str) -> bool {
+        let is_hit = |filter: &Filter| filter.is_match(entry_path);
+
+        // An empty include list admits every path.
+        let included = self.filters.is_empty() || self.filters.iter().any(is_hit);
+
+        // Any skip filter match vetoes the entry.
+        included && !self.skip_filters.iter().any(is_hit)
+    }
+
+    pub(crate) fn should_ignore(&self, ignored: bool) -> bool {
+        !self.run_ignored.should_run(ignored)
+    }
+
+    pub(crate) fn run_action(&self, action: Action) {
+        let _drop_threads = defer(|| BENCH_POOL.drop_threads());
+
+        let mut tree: Vec<EntryTree> = if cfg!(miri) {
+            // Miri does not work with our linker tricks.
+            Vec::new()
+        } else {
+            let group_entries = &crate::entry::GROUP_ENTRIES;
+
+            let generic_bench_entries = group_entries
+                .iter()
+                .flat_map(|group| group.generic_benches_iter().map(AnyBenchEntry::GenericBench));
+
+            let bench_entries = crate::entry::BENCH_ENTRIES
+                .iter()
+                .map(AnyBenchEntry::Bench)
+                .chain(generic_bench_entries);
+
+            let mut tree = EntryTree::from_benches(bench_entries);
+
+            for group in group_entries.iter() {
+                EntryTree::insert_group(&mut tree, group);
+            }
+
+            tree
+        };
+
+        // Filter after inserting groups so that we can properly use groups'
+        // display names.
+        EntryTree::retain(&mut tree, |entry_path| self.filter(entry_path));
+
+        // Quick exit without doing unnecessary work.
+        if tree.is_empty() {
+            return;
+        }
+
+        // Sorting is after filtering to compare fewer elements.
+        EntryTree::sort_by_attr(&mut tree, self.sorting_attr, self.reverse_sort);
+
+        let timer = match self.timer {
+            TimerKind::Os => Timer::Os,
+
+            TimerKind::Tsc => {
+                match Timer::get_tsc() {
+                    Ok(tsc) => tsc,
+                    Err(error) => {
+                        eprintln!("warning: CPU timestamp counter is unavailable ({error}), defaulting to OS");
+                        Timer::Os
+                    }
+                }
+            }
+        };
+
+        if action.is_bench() {
+            eprintln!("Timer precision: {}", timer.precision());
+        }
+
+        let shared_context = SharedContext { action, timer };
+
+        let column_widths = if action.is_bench() {
+            TreeColumn::ALL.map(|column| {
+                if column.is_last() {
+                    // The last column doesn't use padding.
+                    0
+                } else {
+                    EntryTree::common_column_width(&tree, column)
+                }
+            })
+        } else {
+            [0; TreeColumn::COUNT]
+        };
+
+        let tree_painter =
+            RefCell::new(TreePainter::new(EntryTree::max_name_span(&tree, 0), column_widths));
+
+        self.run_tree(action, &tree, &shared_context, None, &tree_painter);
+    }
+
+    fn run_tree(
+        &self,
+        action: Action,
+        tree: &[EntryTree],
+        shared_context: &SharedContext,
+        parent_options: Option<&BenchOptions>,
+        tree_painter: &RefCell<TreePainter>,
+    ) {
+        for (i, child) in tree.iter().enumerate() {
+            let is_last = i == tree.len() - 1;
+
+            let name = child.display_name();
+
+            let child_options = child.bench_options();
+
+            // Overwrite `parent_options` with `child_options` if applicable.
+            let options: BenchOptions;
+            let options: Option<&BenchOptions> = match (parent_options, child_options) {
+                (None, None) => None,
+                (Some(options), None) | (None, Some(options)) => Some(options),
+                (Some(parent_options), Some(child_options)) => {
+                    options = child_options.overwrite(parent_options);
+                    Some(&options)
+                }
+            };
+
+            match child {
+                EntryTree::Leaf { entry, args } => self.run_bench_entry(
+                    action,
+                    *entry,
+                    args.as_deref(),
+                    shared_context,
+                    options,
+                    tree_painter,
+                    is_last,
+                ),
+                EntryTree::Parent { children, .. } => {
+                    tree_painter.borrow_mut().start_parent(name, is_last);
+
+                    self.run_tree(action, children, shared_context, options, tree_painter);
+
+                    tree_painter.borrow_mut().finish_parent();
+                }
+            }
+        }
+    }
+
+    fn run_bench_entry(
+        &self,
+        action: Action,
+        bench_entry: AnyBenchEntry,
+        bench_arg_names: Option<&[&&str]>,
+        shared_context: &SharedContext,
+        entry_options: Option<&BenchOptions>,
+        tree_painter: &RefCell<TreePainter>,
+        is_last_entry: bool,
+    ) {
+        use crate::bench::BenchContext;
+
+        let entry_display_name = bench_entry.display_name();
+
+        // User runtime options override all other options.
+        let options: BenchOptions;
+        let options: &BenchOptions = match entry_options {
+            None => &self.bench_options,
+            Some(entry_options) => {
+                options = self.bench_options.overwrite(entry_options);
+                &options
+            }
+        };
+
+        if self.should_ignore(options.ignore.unwrap_or_default()) {
+            tree_painter.borrow_mut().ignore_leaf(entry_display_name, is_last_entry);
+            return;
+        }
+
+        // Paint empty leaf when simply listing.
+        if action.is_list() {
+            let mut tree_painter = tree_painter.borrow_mut();
+            tree_painter.start_leaf(entry_display_name, is_last_entry);
+            tree_painter.finish_empty_leaf();
+            return;
+        }
+
+        let mut thread_counts: Vec<NonZeroUsize> = options
+            .threads
+            .as_deref()
+            .unwrap_or_default()
+            .iter()
+            .map(|&n| match NonZeroUsize::new(n) {
+                Some(n) => n,
+                None => crate::util::known_parallelism(),
+            })
+            .collect();
+
+        thread_counts.sort_unstable();
+        thread_counts.dedup();
+
+        let thread_counts: &[NonZeroUsize] =
+            if thread_counts.is_empty() { &[NonZeroUsize::MIN] } else { &thread_counts };
+
+        // Whether we should emit child branches for thread counts.
+        let has_thread_branches = thread_counts.len() > 1;
+
+        let run_bench = |bench_display_name: &str,
+                         is_last_bench: bool,
+                         with_bencher: &dyn Fn(Bencher)| {
+            if has_thread_branches {
+                tree_painter.borrow_mut().start_parent(bench_display_name, is_last_bench);
+            } else {
+                tree_painter.borrow_mut().start_leaf(bench_display_name, is_last_bench);
+            }
+
+            for (i, &thread_count) in thread_counts.iter().enumerate() {
+                let is_last_thread_count =
+                    if has_thread_branches { i == thread_counts.len() - 1 } else { is_last_bench };
+
+                if has_thread_branches {
+                    tree_painter
+                        .borrow_mut()
+                        .start_leaf(&format!("t={thread_count}"), is_last_thread_count);
+                }
+
+                let mut bench_context = BenchContext::new(shared_context, options, thread_count);
+                with_bencher(Bencher::new(&mut bench_context));
+
+                if !bench_context.did_run {
+                    eprintln!(
+                        "warning: No benchmark function registered for '{bench_display_name}'"
+                    );
+                }
+
+                let should_compute_stats =
+                    bench_context.did_run && shared_context.action.is_bench();
+
+                if should_compute_stats {
+                    let stats = bench_context.compute_stats();
+                    {
+                        let name = bench_entry.display_name().to_string();
+                        let file = bench_entry.meta().location.file;
+                        let mut module_path = bench_entry
+                            .meta()
+                            .module_path_components()
+                            .skip(1)
+                            .collect::<Vec<_>>()
+                            .join("::");
+                        if !module_path.is_empty() {
+                            module_path.push_str("::");
+                        }
+                        let uri = format!("{file}::{module_path}{name}");
+                        let iter_per_round = bench_context.samples.sample_size;
+                        let times_ns: Vec<_> = bench_context
+                            .samples
+                            .time_samples
+                            .iter()
+                            .map(|s| s.duration.picos / 1_000)
+                            .collect();
+                        let max_time_ns = options.max_time.map(|t| t.as_nanos());
+                        ::codspeed::walltime::collect_raw_walltime_results(
+                            "divan",
+                            name,
+                            uri,
+                            iter_per_round,
+                            max_time_ns,
+                            times_ns,
+                        );
+                    };
+                    tree_painter.borrow_mut().finish_leaf(
+                        is_last_thread_count,
+                        &stats,
+                        self.bytes_format,
+                    );
+                } else {
+                    tree_painter.borrow_mut().finish_empty_leaf();
+                }
+            }
+
+            if has_thread_branches {
+                tree_painter.borrow_mut().finish_parent();
+            }
+        };
+
+        match bench_entry.bench_runner() {
+            BenchEntryRunner::Plain(bench) => run_bench(entry_display_name, is_last_entry, bench),
+
+            BenchEntryRunner::Args(bench_runner) => {
+                tree_painter.borrow_mut().start_parent(entry_display_name, is_last_entry);
+
+                let bench_runner = bench_runner();
+                let orig_arg_names = bench_runner.arg_names();
+                let bench_arg_names = bench_arg_names.unwrap_or_default();
+
+                for (i, &arg_name) in bench_arg_names.iter().enumerate() {
+                    let is_last_arg = i == bench_arg_names.len() - 1;
+                    let arg_index = util::slice_ptr_index(orig_arg_names, arg_name);
+
+                    run_bench(arg_name, is_last_arg, &|bencher| {
+                        bench_runner.bench(bencher, arg_index);
+                    });
+                }
+
+                tree_painter.borrow_mut().finish_parent();
+            }
+        }
+    }
+}
+
+/// Makes `Divan::skip_regex` input polymorphic.
+pub trait SkipRegex {
+    fn skip_regex(self, divan: &mut Divan);
+}
+
+impl SkipRegex for Regex {
+    fn skip_regex(self, divan: &mut Divan) {
+        divan.skip_filters.push(Filter::Regex(self));
+    }
+}
+
+impl SkipRegex for &str {
+    #[track_caller]
+    fn skip_regex(self, divan: &mut Divan) {
+        Regex::new(self).unwrap().skip_regex(divan);
+    }
+}
+
+impl SkipRegex for String {
+    #[track_caller]
+    fn skip_regex(self, divan: &mut Divan) {
+        self.as_str().skip_regex(divan)
+    }
+}
+
+/// Configuration options.
+impl Divan {
+    /// Creates an instance with options set by parsing CLI arguments.
+    pub fn from_args() -> Self {
+        Self::default().config_with_args()
+    }
+
+    /// Sets options by parsing CLI arguments.
+    ///
+    /// This may override any previously-set options.
+    #[must_use]
+    pub fn config_with_args(mut self) -> Self {
+        let mut command = crate::cli::command();
+
+        let matches = command.get_matches_mut();
+        let is_exact = matches.get_flag("exact");
+
+        let mut parse_filter = |filter: &String| {
+            if is_exact {
+                Filter::Exact(filter.to_owned())
+            } else {
+                match Regex::new(filter) {
+                    Ok(r) => Filter::Regex(r),
+                    Err(error) => {
+                        let kind = clap::error::ErrorKind::ValueValidation;
+                        command.error(kind, error).exit();
+                    }
+                }
+            }
+        };
+
+        if let Some(filters) = matches.get_many::<String>("filter") {
+            self.filters.extend(filters.map(&mut parse_filter));
+        }
+
+        if let Some(skip_filters) = matches.get_many::<String>("skip") {
+            self.skip_filters.extend(skip_filters.map(&mut parse_filter));
+        }
+
+        self.action = if matches.get_flag("list") {
+            Action::List
+        } else if matches.get_flag("test") || !matches.get_flag("bench") {
+            // Either of:
+            // `cargo bench -- --test`
+            // `cargo test --benches`
+            Action::Test
+        } else {
+            Action::Bench
+        };
+
+        if let Some(&color) = matches.get_one("color") {
+            self.color = color;
+        }
+
+        if matches.get_flag("ignored") {
+            self.run_ignored = RunIgnored::Only;
+        } else if matches.get_flag("include-ignored") {
+            self.run_ignored = RunIgnored::Yes;
+        }
+
+        if let Some(&timer) = matches.get_one("timer") {
+            self.timer = timer;
+        }
+
+        if let Some(&sorting_attr) = matches.get_one("sortr") {
+            self.reverse_sort = true;
+            self.sorting_attr = sorting_attr;
+        } else if let Some(&sorting_attr) = matches.get_one("sort") {
+            self.reverse_sort = false;
+            self.sorting_attr = sorting_attr;
+        }
+
+        if let Some(&sample_count) = matches.get_one("sample-count") {
+            self.bench_options.sample_count = Some(sample_count);
+        }
+
+        if let Some(&sample_size) = matches.get_one("sample-size") {
+            self.bench_options.sample_size = Some(sample_size);
+        }
+
+        if let Some(thread_counts) = matches.get_many::<usize>("threads") {
+            let mut threads: Vec<usize> = thread_counts.copied().collect();
+            threads.sort_unstable();
+            threads.dedup();
+            self.bench_options.threads = Some(Cow::Owned(threads));
+        }
+
+        if let Some(&ParsedSeconds(min_time)) = matches.get_one("min-time") {
+            self.bench_options.min_time = Some(min_time);
+        }
+
+        if let Some(&ParsedSeconds(max_time)) = matches.get_one("max-time") {
+            self.bench_options.max_time = Some(max_time);
+        }
+
+        if let Some(mut skip_ext_time) = matches.get_many::<bool>("skip-ext-time") {
+            // If the option is present without a value, then it's `true`.
+            self.bench_options.skip_ext_time =
+                Some(matches!(skip_ext_time.next(), Some(true) | None));
+        }
+
+        if let Some(&count) = matches.get_one::<MaxCountUInt>("items-count") {
+            self.counter_mut(ItemsCount::new(count));
+        }
+
+        if let Some(&count) = matches.get_one::<MaxCountUInt>("bytes-count") {
+            self.counter_mut(BytesCount::new(count));
+        }
+
+        if let Some(&PrivBytesFormat(bytes_format)) = matches.get_one("bytes-format") {
+            self.bytes_format = bytes_format;
+        }
+
+        if let Some(&count) = matches.get_one::<MaxCountUInt>("chars-count") {
+            self.counter_mut(CharsCount::new(count));
+        }
+
+        if let Some(&count) = matches.get_one::<MaxCountUInt>("cycles-count") {
+            self.counter_mut(CyclesCount::new(count));
+        }
+
+        self
+    }
+
+    /// Chooses whether output is colored.
+    ///
+    /// Equivalent to the `--color` CLI argument; passing [`None`] selects
+    /// "auto".
+    #[must_use]
+    pub fn color(mut self, yes: impl Into<Option<bool>>) -> Self {
+        let forced: Option<bool> = yes.into();
+        self.color = forced.map_or(ColorChoice::Auto, |on| {
+            if on { ColorChoice::Always } else { ColorChoice::Never }
+        });
+
+        self
+    }
+
+    /// Also run benchmarks marked [`#[ignore]`](https://doc.rust-lang.org/reference/attributes/testing.html#the-ignore-attribute).
+    ///
+    /// This option is equivalent to the `--include-ignored` CLI argument.
+    #[must_use]
+    pub fn run_ignored(mut self) -> Self {
+        self.run_ignored = RunIgnored::Yes;
+        self
+    }
+
+    /// Only run benchmarks marked [`#[ignore]`](https://doc.rust-lang.org/reference/attributes/testing.html#the-ignore-attribute).
+    ///
+    /// This option is equivalent to the `--ignored` CLI argument.
+    #[must_use]
+    pub fn run_only_ignored(mut self) -> Self {
+        self.run_ignored = RunIgnored::Only;
+        self
+    }
+
+    /// Skips benchmarks that match `filter` as a regular expression pattern.
+    ///
+    /// This option is equivalent to the `--skip filter` CLI argument, without
+    /// `--exact`.
+    ///
+    /// # Examples
+    ///
+    /// This method is commonly used with a [`&str`](prim@str) or [`String`]:
+    ///
+    /// ```
+    /// # use divan::Divan;
+    /// let filter = "(add|sub)";
+    /// let divan = Divan::default().skip_regex(filter);
+    /// ```
+    ///
+    /// A pre-built [`Regex`] can also be provided:
+    ///
+    /// ```
+    /// # use divan::Divan;
+    /// let filter = regex::Regex::new("(add|sub)").unwrap();
+    /// let divan = Divan::default().skip_regex(filter);
+    /// ```
+    ///
+    /// Calling this repeatedly will add multiple skip filters:
+    ///
+    /// ```
+    /// # use divan::Divan;
+    /// let divan = Divan::default()
+    ///     .skip_regex("(add|sub)")
+    ///     .skip_regex("collections.*default");
+    /// ```
+    ///
+    /// # Panics
+    ///
+    /// Panics if `filter` is a string and [`Regex::new`] fails.
+    #[must_use]
+    pub fn skip_regex(mut self, filter: impl SkipRegex) -> Self {
+        filter.skip_regex(&mut self);
+        self
+    }
+
+    /// Skips benchmarks that exactly match `filter`.
+    ///
+    /// This option is equivalent to the `--skip filter --exact` CLI arguments.
+    ///
+    /// # Examples
+    ///
+    /// This method is commonly used with a [`&str`](prim@str) or [`String`]:
+    ///
+    /// ```
+    /// # use divan::Divan;
+    /// let filter = "arithmetic::add";
+    /// let divan = Divan::default().skip_exact(filter);
+    /// ```
+    ///
+    /// Each call appends another exact filter; earlier filters are kept:
+    ///
+    /// ```
+    /// # use divan::Divan;
+    /// let divan = Divan::default()
+    ///     .skip_exact("arithmetic::add")
+    ///     .skip_exact("collections::vec::default");
+    /// ```
+    #[must_use]
+    pub fn skip_exact(mut self, filter: impl Into<String>) -> Self {
+        self.skip_filters.push(Filter::Exact(filter.into()));
+        self
+    }
+
+    /// Sets the number of sampling iterations (the sample count).
+    ///
+    /// This option is equivalent to the `--sample-count` CLI argument.
+    ///
+    /// If a benchmark enables [`threads`](macro@crate::bench#threads), the
+    /// sample count becomes a multiple of the number of threads. This is because
+    /// each thread operates over the same sample size to ensure there are always
+    /// N competing threads doing the same amount of work.
+    #[inline]
+    pub fn sample_count(mut self, count: u32) -> Self {
+        self.bench_options.sample_count = Some(count);
+        self
+    }
+
+    /// Sets the number of iterations inside a single sample (the sample size).
+    ///
+    /// This option is equivalent to the `--sample-size` CLI argument.
+    #[inline]
+    pub fn sample_size(mut self, count: u32) -> Self {
+        self.bench_options.sample_size = Some(count);
+        self
+    }
+
+    /// Runs benchmarks across the given thread counts.
+    ///
+    /// This enables you to measure contention on [atomics and
+    /// locks](std::sync). A value of 0 indicates [available
+    /// parallelism](std::thread::available_parallelism).
+    ///
+    /// This option is equivalent to the `--threads` CLI argument or
+    /// `DIVAN_THREADS` environment variable.
+    #[inline]
+    pub fn threads<T>(mut self, threads: T) -> Self
+    where
+        T: IntoIterator<Item = usize>,
+    {
+        // Normalize the requested counts into an ascending, duplicate-free
+        // list before storing them.
+        let mut counts = threads.into_iter().collect::<Vec<usize>>();
+        counts.sort_unstable();
+        counts.dedup();
+        self.bench_options.threads = Some(Cow::Owned(counts));
+        self
+    }
+
+    /// Sets the time floor (minimum duration) for benchmarking a function.
+    ///
+    /// This option is equivalent to the `--min-time` CLI argument.
+    #[inline]
+    pub fn min_time(mut self, time: Duration) -> Self {
+        self.bench_options.min_time = Some(time);
+        self
+    }
+
+    /// Sets the time ceiling (maximum duration) for benchmarking a function.
+    ///
+    /// This option is equivalent to the `--max-time` CLI argument.
+    #[inline]
+    pub fn max_time(mut self, time: Duration) -> Self {
+        self.bench_options.max_time = Some(time);
+        self
+    }
+
+    /// Sets whether time external to benchmarked functions is skipped when
+    /// accounting for `min_time` or `max_time`.
+    ///
+    /// This option is equivalent to the `--skip-ext-time` CLI argument.
+    #[inline]
+    pub fn skip_ext_time(mut self, skip: bool) -> Self {
+        self.bench_options.skip_ext_time = Some(skip);
+        self
+    }
+}
+
+/// Use [`Counter`s](crate::counter::Counter) to get throughput across all
+/// benchmarks.
+impl Divan {
+    #[inline]
+    fn counter_mut<C: IntoCounter>(&mut self, counter: C) -> &mut Self {
+        self.bench_options.counters.insert(counter);
+        self
+    }
+
+    /// Counts the number of values processed, as described by `counter`.
+    #[inline]
+    pub fn counter<C: IntoCounter>(mut self, counter: C) -> Self {
+        self.counter_mut(counter);
+        self
+    }
+
+    /// Sets the number of items processed.
+    ///
+    /// This option is equivalent to the `--items-count` CLI argument or
+    /// `DIVAN_ITEMS_COUNT` environment variable.
+    #[inline]
+    pub fn items_count<C: Into<ItemsCount>>(self, count: C) -> Self {
+        self.counter(count.into())
+    }
+
+    /// Sets the number of bytes processed.
+    ///
+    /// This option is equivalent to the `--bytes-count` CLI argument or
+    /// `DIVAN_BYTES_COUNT` environment variable.
+    #[inline]
+    pub fn bytes_count<C: Into<BytesCount>>(self, count: C) -> Self {
+        self.counter(count.into())
+    }
+
+    /// Determines how [`BytesCount`] is scaled in benchmark outputs.
+    ///
+    /// This option is equivalent to the `--bytes-format` CLI argument or
+    /// `DIVAN_BYTES_FORMAT` environment variable.
+    #[inline]
+    pub fn bytes_format(mut self, format: BytesFormat) -> Self {
+        self.bytes_format = format;
+        self
+    }
+
+    /// Sets the number of characters processed.
+    ///
+    /// This option is equivalent to the `--chars-count` CLI argument or
+    /// `DIVAN_CHARS_COUNT` environment variable.
+    #[inline]
+    pub fn chars_count<C: Into<CharsCount>>(self, count: C) -> Self {
+        self.counter(count.into())
+    }
+
+    /// Sets the number of cycles processed, displayed as Hertz.
+    ///
+    /// This option is equivalent to the `--cycles-count` CLI argument or
+    /// `DIVAN_CYCLES_COUNT` environment variable.
+    #[inline]
+    pub fn cycles_count<C: Into<CyclesCount>>(self, count: C) -> Self {
+        self.counter(count.into())
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/entry/generic.rs b/crates/divan_compat/divan_fork/src/entry/generic.rs
new file mode 100644
index 00000000..75cc5a2a
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/entry/generic.rs
@@ -0,0 +1,180 @@
+use std::{
+    any::{Any, TypeId},
+    cmp::Ordering,
+    mem::ManuallyDrop,
+    sync::OnceLock,
+};
+
+use crate::{
+    entry::{BenchEntryRunner, GroupEntry},
+    util::sort::natural_cmp,
+};
+
+/// Compile-time entry for a generic benchmark function, generated by
+/// `#[divan::bench]`.
+///
+/// Unlike `BenchEntry`, this is for a specific generic type or `const`.
+///
+/// Although this type contains trivially-`Copy` data, it *should not* implement
+/// `Clone` because the memory address of each instance is used to determine the
+/// relative order in `GroupEntry.generic_benches` when sorting benchmarks by
+/// location.
+pub struct GenericBenchEntry {
+    /// The associated group, which supplies this entry's metadata.
+    pub group: &'static GroupEntry,
+
+    /// The benchmarking function.
+    pub bench: BenchEntryRunner,
+
+    /// The generic type this entry is instantiated with, if any.
+    pub ty: Option<EntryType>,
+
+    /// The `const` value this entry is instantiated with, and associated data.
+    pub const_value: Option<EntryConst>,
+}
+
+impl GenericBenchEntry {
+    /// The `const` value's name if there is one, else the type's raw name.
+    pub(crate) fn raw_name(&self) -> &str {
+        match (&self.const_value, &self.ty) {
+            (Some(value), _) => value.name(),
+            (None, Some(ty)) => ty.raw_name(),
+            (None, None) => unreachable!(),
+        }
+    }
+
+    /// The `const` value's name if there is one, else the type's display name.
+    pub(crate) fn display_name(&self) -> &str {
+        match (&self.const_value, &self.ty) {
+            (Some(value), _) => value.name(),
+            (None, Some(ty)) => ty.display_name(),
+            (None, None) => unreachable!(),
+        }
+    }
+
+    /// Path components under which this entry is placed: the group's module
+    /// path, the group's raw name, and (for `const` entries that also have a
+    /// generic type) the type's display name.
+    pub(crate) fn path_components(&self) -> impl Iterator<Item = &str> {
+        // Generic benchmarks consider their group's raw name to be the path
+        // component after the module path.
+        let meta = &self.group.meta;
+        let leading = meta.module_path_components().chain(Some(meta.raw_name));
+
+        // If this is a generic const benchmark with generic types, the generic
+        // types are considered to be the parent of the const values.
+        //
+        // FIXME: Switch back to `raw_name` once we have a way to insert
+        // this `display_name` into `EntryTree::Parent`. The current
+        // approach allows different types with the same name to become the
+        // same `EntryTree::Parent`.
+        let type_parent = self.const_value.as_ref().and(self.ty.as_ref());
+        leading.chain(type_parent.map(EntryType::display_name))
+    }
+}
+
+/// The type that a generic benchmark is instantiated with.
+pub struct EntryType {
+    /// Function returning the type's name via [`std::any::type_name`].
+    get_type_name: fn() -> &'static str,
+
+    /// Function returning the type's ID via [`std::any::TypeId::of`].
+    #[allow(dead_code)]
+    get_type_id: fn() -> TypeId,
+}
+
+impl EntryType {
+    /// Creates an instance for the given type.
+    pub const fn new<T: Any>() -> Self {
+        Self { get_type_id: TypeId::of::<T>, get_type_name: std::any::type_name::<T> }
+    }
+
+    /// The full type name as reported by the stored `type_name` function.
+    pub(crate) fn raw_name(&self) -> &'static str {
+        let get_type_name = self.get_type_name;
+        get_type_name()
+    }
+
+    /// The type name with its leading module path removed.
+    pub(crate) fn display_name(&self) -> &'static str {
+        let mut name = self.raw_name();
+        loop {
+            match name.split_once("::") {
+                // Drop a leading module component, but do not go past a
+                // generic type boundary.
+                Some((module, rest)) if !module.contains('<') => name = rest,
+                _ => break name,
+            }
+        }
+    }
+}
+
+/// A type-erased reference to a `const`, stored as a `&'static T`.
+pub struct EntryConst {
+    /// The `&'static T`, with its type erased.
+    value: *const (),
+
+    /// [`PartialOrd::partial_cmp`] for `T`, taking type-erased pointers.
+    partial_cmp: unsafe fn(*const (), *const ()) -> Option<Ordering>,
+
+    /// [`ToString::to_string`] for `T`, taking a type-erased pointer.
+    to_string: unsafe fn(*const ()) -> String,
+
+    /// Cached `to_string` result.
+    cached_string: ManuallyDrop<OnceLock<&'static str>>,
+}
+
+// SAFETY: `EntryConst::new` only accepts `T: Send + Sync`.
+unsafe impl Send for EntryConst {}
+unsafe impl Sync for EntryConst {}
+
+impl EntryConst {
+    /// Creates entry data for a `const` value.
+    pub const fn new<T>(value: &'static T) -> Self
+    where
+        T: PartialOrd + ToString + Send + Sync,
+    {
+        unsafe fn partial_cmp<T: PartialOrd>(a: *const (), b: *const ()) -> Option<Ordering> {
+            T::partial_cmp(&*a.cast(), &*b.cast())
+        }
+
+        unsafe fn to_string<T: ToString>(value: *const ()) -> String {
+            T::to_string(&*value.cast())
+        }
+
+        Self {
+            value: value as *const T as *const (),
+            partial_cmp: partial_cmp::<T>,
+            to_string: to_string::<T>,
+            cached_string: ManuallyDrop::new(OnceLock::new()),
+        }
+    }
+
+    /// Returns [`PartialOrd::partial_cmp`] ordering if `<` or `>`, falling back
+    /// to comparing [`ToString::to_string`] otherwise.
+    pub(crate) fn cmp_name(&self, other: &Self) -> Ordering {
+        if self.partial_cmp == other.partial_cmp {
+            // SAFETY: Both constants share one comparison function, which this
+            // code treats as proof that they have the same type.
+            if let Some(ordering) = unsafe { (self.partial_cmp)(self.value, other.value) } {
+                if !ordering.is_eq() {
+                    return ordering;
+                }
+            }
+        }
+
+        // Fall back to comparing the names in natural order.
+        natural_cmp(self.name(), other.name())
+    }
+
+    /// The value's [`ToString::to_string`] output, computed once and cached.
+    #[inline]
+    pub(crate) fn name(&self) -> &str {
+        self.cached_string.get_or_init(|| {
+            // SAFETY: The function is guaranteed to call `T::to_string`.
+            let string = unsafe { (self.to_string)(self.value) };
+
+            Box::leak(string.into_boxed_str())
+        })
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/entry/list.rs b/crates/divan_compat/divan_fork/src/entry/list.rs
new file mode 100644
index 00000000..5ad06bd8
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/entry/list.rs
@@ -0,0 +1,79 @@
+use std::{
+    ptr,
+    sync::atomic::{AtomicPtr, Ordering as AtomicOrdering},
+};
+
+/// Singly linked list of `'static` entries.
+///
+/// This is implemented in a thread-safe way despite the fact that constructors
+/// are run single-threaded.
+pub struct EntryList<T: 'static> {
+    entry: Option<&'static T>,
+    next: AtomicPtr<Self>,
+}
+
+impl<T> EntryList<T> {
+    pub(crate) const fn root() -> Self {
+        Self { entry: None, next: AtomicPtr::new(ptr::null_mut()) }
+    }
+
+    /// Dereferences the `next` pointer, returning `None` if it is null.
+    #[inline]
+    fn next(&self) -> Option<&Self> {
+        // SAFETY: `next` is only assigned by `push`, which always receives a
+        // 'static lifetime.
+        unsafe { self.next.load(AtomicOrdering::Relaxed).as_ref() }
+    }
+}
+
+// Externally used by macros or tests.
+#[allow(missing_docs)]
+impl<T> EntryList<T> {
+    #[inline]
+    pub const fn new(entry: &'static T) -> Self {
+        Self { entry: Some(entry), next: AtomicPtr::new(ptr::null_mut()) }
+    }
+
+    /// Creates an iterator over the entries of `self` and all following nodes.
+    #[inline]
+    pub fn iter(&self) -> impl Iterator<Item = &T> {
+        let mut list = Some(self);
+        std::iter::from_fn(move || -> Option<Option<&T>> {
+            let current = list?;
+            list = current.next();
+            Some(current.entry.as_ref().copied())
+        })
+        .flatten()
+    }
+
+    /// Inserts `other` at the front of the list, directly after `self`.
+    ///
+    /// # Note
+    ///
+    /// This function must be safe to call before `main`.
+    #[inline]
+    pub fn push(&'static self, other: &'static Self) {
+        let mut old_next = self.next.load(AtomicOrdering::Relaxed);
+        loop {
+            // Each publicly-created instance has `list.next` be null, so we can
+            // simply store `self.next` there.
+            other.next.store(old_next, AtomicOrdering::Release);
+
+            // NOTE: The content of `other` can already be seen, so we don't
+            // need to strongly order reads into it.
+            let other = other as *const Self as *mut Self;
+            match self.next.compare_exchange_weak(
+                old_next,
+                other,
+                AtomicOrdering::AcqRel,
+                AtomicOrdering::Acquire,
+            ) {
+                // Successfully wrote our thread's value to the list.
+                Ok(_) => return,
+
+                // Lost the race, store winner's value in `other.next`.
+                Err(new) => old_next = new,
+            }
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/entry/meta.rs b/crates/divan_compat/divan_fork/src/entry/meta.rs
new file mode 100644
index 00000000..be75c855
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/entry/meta.rs
@@ -0,0 +1,44 @@
+use std::sync::LazyLock;
+
+use crate::bench::BenchOptions;
+
+/// Metadata common to `#[divan::bench]` and `#[divan::bench_group]`.
+pub struct EntryMeta {
+    /// The entry's display name.
+    pub display_name: &'static str,
+
+    /// The entry's original name.
+    ///
+    /// This is used to find a `GroupEntry` for a `BenchEntry`.
+    pub raw_name: &'static str,
+
+    /// The entry's raw `module_path!()`, with components separated by `::`.
+    pub module_path: &'static str,
+
+    /// Where the entry was defined.
+    pub location: EntryLocation,
+
+    /// Benchmarker configuration from attribute options, if any.
+    pub bench_options: Option<LazyLock<BenchOptions<'static>>>,
+}
+
+/// Where an entry is located: its file, line, and column.
+#[derive(Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
+#[allow(missing_docs)]
+pub struct EntryLocation {
+    pub file: &'static str,
+    pub line: u32,
+    pub col: u32,
+}
+
+impl EntryMeta {
+    #[inline]
+    pub(crate) fn bench_options(&self) -> Option<&BenchOptions> {
+        self.bench_options.as_ref().map(|options| &**options)
+    }
+
+    #[inline]
+    pub(crate) fn module_path_components<'a>(&self) -> impl Iterator<Item = &'a str> {
+        str::split(self.module_path, "::")
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/entry/mod.rs b/crates/divan_compat/divan_fork/src/entry/mod.rs
new file mode 100644
index 00000000..2070f63f
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/entry/mod.rs
@@ -0,0 +1,126 @@
+use std::ptr::NonNull;
+
+use crate::{bench::BenchArgsRunner, Bencher};
+
+mod generic;
+mod list;
+mod meta;
+mod tree;
+
+pub use self::{
+    generic::{EntryConst, EntryType, GenericBenchEntry},
+    list::EntryList,
+    meta::{EntryLocation, EntryMeta},
+};
+pub(crate) use tree::EntryTree;
+
+/// Root of the list of benchmark entries generated by `#[divan::bench]`.
+///
+/// Note: generic-type benchmark entries are instead stored in `GROUP_ENTRIES`
+/// in `generic_benches`.
+pub static BENCH_ENTRIES: EntryList<BenchEntry> = EntryList::root();
+
+/// Root of the list of group entries generated by `#[divan::bench_group]`.
+pub static GROUP_ENTRIES: EntryList<GroupEntry> = EntryList::root();
+
+/// Determines how the benchmark entry is run.
+#[derive(Clone, Copy)]
+pub enum BenchEntryRunner {
+    /// Benchmark without arguments; the function receives a [`Bencher`].
+    Plain(fn(Bencher)),
+
+    /// Benchmark with runtime arguments; the function builds the runner.
+    Args(fn() -> BenchArgsRunner),
+}
+
+/// Compile-time entry for a benchmark, generated by `#[divan::bench]`.
+pub struct BenchEntry {
+    /// Entry metadata (names, module path, location, and options).
+    pub meta: EntryMeta,
+
+    /// How the benchmark is run.
+    pub bench: BenchEntryRunner,
+}
+
+/// Compile-time entry for a benchmark group, generated by
+/// `#[divan::bench_group]` or a generic-type `#[divan::bench]`.
+pub struct GroupEntry {
+    /// Entry metadata (names, module path, location, and options).
+    pub meta: EntryMeta,
+
+    /// Generic `#[divan::bench]` entries, if this group has any.
+    ///
+    /// This is two-dimensional to make code generation simpler. The outer
+    /// dimension corresponds to types and the inner dimension corresponds to
+    /// constants.
+    pub generic_benches: Option<&'static [&'static [GenericBenchEntry]]>,
+}
+
+impl GroupEntry {
+    pub(crate) fn generic_benches_iter(&self) -> impl Iterator<Item = &'static GenericBenchEntry> {
+        self.generic_benches.into_iter().flatten().copied().flatten()
+    }
+}
+
+/// A borrowed `BenchEntry` or `GenericBenchEntry`.
+#[derive(Clone, Copy)]
+pub(crate) enum AnyBenchEntry<'a> {
+    Bench(&'a BenchEntry),
+    GenericBench(&'a GenericBenchEntry),
+}
+
+impl<'a> AnyBenchEntry<'a> {
+    /// Returns the address of the underlying entry, used as its identity.
+    #[inline]
+    pub fn entry_addr(self) -> NonNull<()> {
+        match self {
+            Self::GenericBench(generic) => NonNull::from(generic).cast(),
+            Self::Bench(plain) => NonNull::from(plain).cast(),
+        }
+    }
+
+    /// Returns the runner stored on the underlying entry.
+    #[inline]
+    pub fn bench_runner(self) -> &'a BenchEntryRunner {
+        match self {
+            Self::GenericBench(generic) => &generic.bench,
+            Self::Bench(plain) => &plain.bench,
+        }
+    }
+
+    /// Returns the runner's argument names; `None` for a `Plain` runner.
+    #[inline]
+    pub fn arg_names(self) -> Option<&'static [&'static str]> {
+        match self.bench_runner() {
+            BenchEntryRunner::Args(make_runner) => Some(make_runner().arg_names()),
+            BenchEntryRunner::Plain(_) => None,
+        }
+    }
+
+    /// Returns the entry's own metadata, or its group's for generic entries.
+    #[inline]
+    pub fn meta(self) -> &'a EntryMeta {
+        match self {
+            Self::GenericBench(generic) => &generic.group.meta,
+            Self::Bench(plain) => &plain.meta,
+        }
+    }
+
+    /// Returns the raw name of the underlying entry.
+    #[inline]
+    pub fn raw_name(self) -> &'a str {
+        match self {
+            Self::GenericBench(generic) => generic.raw_name(),
+            Self::Bench(plain) => plain.meta.raw_name,
+        }
+    }
+
+    /// Returns the display name of the underlying entry.
+    #[inline]
+    pub fn display_name(self) -> &'a str {
+        match self {
+            Self::GenericBench(generic) => generic.display_name(),
+            Self::Bench(plain) => plain.meta.display_name,
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/entry/tree.rs b/crates/divan_compat/divan_fork/src/entry/tree.rs
new file mode 100644
index 00000000..1cd31ee8
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/entry/tree.rs
@@ -0,0 +1,412 @@
+use std::{cmp::Ordering, ptr::NonNull};
+
+use crate::{
+    bench::{BenchOptions, DEFAULT_SAMPLE_COUNT},
+    config::SortingAttr,
+    counter::KnownCounterKind,
+    entry::{AnyBenchEntry, EntryLocation, EntryMeta, GenericBenchEntry, GroupEntry},
+    tree_painter::TreeColumn,
+    util::sort::natural_cmp,
+};
+
+/// Tree of benchmark entries organized by path components.
+pub(crate) enum EntryTree<'a> {
+    /// Benchmark group; parent to leaves and other parents.
+    Parent { raw_name: &'a str, group: Option<&'a GroupEntry>, children: Vec<Self> },
+
+    /// Benchmark entry leaf.
+    Leaf {
+        /// The benchmark entry being run.
+        entry: AnyBenchEntry<'a>,
+
+        /// The names of arguments to run, if the entry takes arguments.
+        args: Option<Vec<&'static &'static str>>,
+    },
+}
+
+impl<'a> EntryTree<'a> {
+    /// Constructs the top-level trees from benchmark entries, inserting each
+    /// entry under its path components in the order the iterator yields them.
+    pub fn from_benches<I>(benches: I) -> Vec<Self>
+    where
+        I: IntoIterator<Item = AnyBenchEntry<'a>>,
+    {
+        let mut result = Vec::<Self>::new();
+
+        for bench in benches {
+            let mut insert_entry = |path_iter| {
+                Self::insert_entry(&mut result, bench, path_iter);
+            };
+
+            match bench {
+                AnyBenchEntry::Bench(bench) => {
+                    insert_entry(&mut bench.meta.module_path_components());
+                }
+                AnyBenchEntry::GenericBench(bench) => {
+                    insert_entry(&mut bench.path_components());
+                }
+            }
+        }
+
+        result
+    }
+
+    /// Returns the maximum span for a name in `tree`, where `tree` is at `depth`.
+    ///
+    /// This is the number of terminal columns used for labeling benchmark names
+    /// prior to emitting stats columns.
+    pub fn max_name_span(tree: &[Self], depth: usize) -> usize {
+        // The number of terminal columns used per-depth for box drawing
+        // characters. For example, "│  ╰─ " is 6 for depth 2.
+        const DEPTH_COLS: usize = 3;
+
+        tree.iter()
+            .map(|node| {
+                let node_name_span = {
+                    let prefix_len = depth * DEPTH_COLS;
+                    let name_len = node.display_name().chars().count();
+                    prefix_len + name_len
+                };
+
+                // The maximum span of any descendant.
+                let children_max_span = Self::max_name_span(node.children(), depth + 1);
+
+                // The maximum span of any runtime argument, one level deeper.
+                let args_max_span = node
+                    .arg_names()
+                    .unwrap_or_default()
+                    .iter()
+                    .map(|arg| {
+                        let prefix_len = (depth + 1) * DEPTH_COLS;
+                        let name_len = arg.chars().count();
+                        prefix_len + name_len
+                    })
+                    .max()
+                    .unwrap_or_default();
+
+                node_name_span.max(children_max_span).max(args_max_span)
+            })
+            .max()
+            .unwrap_or_default()
+    }
+
+    /// Returns the likely span for a given column.
+    ///
+    /// Nodes without bench options contribute no width of their own, but their
+    /// descendants are still visited so nested options are not overlooked.
+    pub fn common_column_width(tree: &[Self], column: TreeColumn) -> usize {
+        // Time and throughput info.
+        if column.is_time_stat() {
+            return KnownCounterKind::MAX_COMMON_COLUMN_WIDTH;
+        }
+
+        tree.iter()
+            .map(|node| {
+                let width = match (column, node.bench_options()) {
+                    (TreeColumn::Samples, Some(options)) => {
+                        let sample_count = options.sample_count.unwrap_or(DEFAULT_SAMPLE_COUNT);
+                        // Decimal digit count; `checked_ilog10` is `None` for 0.
+                        1 + sample_count.checked_ilog10().unwrap_or_default() as usize
+                    }
+
+                    // Iters is the last column, so it does not need pad width.
+                    // All other columns are time stats handled previously.
+                    _ => 0,
+                };
+
+                width.max(Self::common_column_width(node.children(), column))
+            })
+            .max()
+            .unwrap_or_default()
+    }
+
+    /// Attaches the benchmark group to its matching parent in a tree.
+    ///
+    /// Groups are inserted after tree construction because it prevents having
+    /// parents without terminating leaves. Groups that do not match an existing
+    /// parent are not inserted.
+    pub fn insert_group(mut tree: &mut [Self], group: &'a GroupEntry) {
+        // Update `tree` to be the innermost set of subtrees whose parents match
+        // `group.module_path`.
+        'component: for component in group.meta.module_path_components() {
+            for subtree in tree {
+                match subtree {
+                    EntryTree::Parent { raw_name, children, .. } if component == *raw_name => {
+                        tree = children;
+                        continue 'component;
+                    }
+                    _ => {}
+                }
+            }
+
+            // No matches for this component in any subtrees.
+            return;
+        }
+
+        // Find the parent whose raw name matches the group and attach it.
+        for subtree in tree {
+            match subtree {
+                EntryTree::Parent { raw_name, group: slot, .. }
+                    if group.meta.raw_name == *raw_name =>
+                {
+                    *slot = Some(group);
+                    return;
+                }
+                _ => {}
+            }
+        }
+    }
+
+    /// Removes entries whose `::`-joined display-name paths `filter` rejects.
+    pub fn retain(tree: &mut Vec<Self>, mut filter: impl FnMut(&str) -> bool) {
+        fn retain(
+            tree: &mut Vec<EntryTree>,
+            parent_path: &str,
+            filter: &mut impl FnMut(&str) -> bool,
+        ) {
+            tree.retain_mut(|subtree| {
+                let subtree_path: String;
+                let subtree_path: &str = if parent_path.is_empty() {
+                    subtree.display_name()
+                } else {
+                    subtree_path = format!("{parent_path}::{}", subtree.display_name());
+                    &subtree_path
+                };
+
+                match subtree {
+                    EntryTree::Parent { children, .. } => {
+                        retain(children, subtree_path, filter);
+
+                        // If no children remain, filter out this parent.
+                        !children.is_empty()
+                    }
+
+                    EntryTree::Leaf { args: None, .. } => filter(subtree_path),
+
+                    EntryTree::Leaf { args: Some(args), .. } => {
+                        args.retain(|arg| filter(&format!("{subtree_path}::{arg}")));
+
+                        // If no arguments remain, filter out this leaf.
+                        !args.is_empty()
+                    }
+                }
+            });
+        }
+        retain(tree, "", &mut filter);
+    }
+
+    /// Recursively sorts `tree` by `attr`, reversing the order if `reverse`.
+    pub fn sort_by_attr(tree: &mut [Self], attr: SortingAttr, reverse: bool) {
+        // Flips a comparison result when sorting in reverse.
+        let directed =
+            |ordering: Ordering| if reverse { ordering.reverse() } else { ordering };
+
+        // Order this level first, then handle what each node contains.
+        tree.sort_unstable_by(|a, b| directed(a.cmp_by_attr(b, attr)));
+
+        for node in tree {
+            match node {
+                // Order the runtime argument names of a leaf.
+                Self::Leaf { args: Some(args), .. } => {
+                    args.sort_by(|&a, &b| directed(attr.cmp_bench_arg_names(a, b)));
+                }
+                // A leaf without runtime arguments has nothing to sort.
+                Self::Leaf { args: None, .. } => {}
+
+                // Descend into the children of a parent.
+                Self::Parent { children, .. } => Self::sort_by_attr(children, attr, reverse),
+            }
+        }
+    }
+
+    fn cmp_by_attr(&self, other: &Self, attr: SortingAttr) -> Ordering {
+        // Entries have stable addresses, unlike `EntryTree` nodes, so their
+        // addresses can serve as identities. `None` if either node has none.
+        let entry_addr_ordering = match (self.entry_addr(), other.entry_addr()) {
+            (Some(a), Some(b)) => Some(a.cmp(&b)),
+            _ => None,
+        };
+
+        // If entries have the same address, then all attributes will be equal.
+        if matches!(entry_addr_ordering, Some(Ordering::Equal)) {
+            return Ordering::Equal;
+        }
+
+        for attr in attr.with_tie_breakers() {
+            let ordering = match attr {
+                SortingAttr::Kind => self.kind().cmp(&other.kind()),
+                SortingAttr::Name => self.cmp_display_name(other),
+                SortingAttr::Location => {
+                    let location_ordering = self.location().cmp(&other.location());
+
+                    // Use the entry's address to break location ties.
+                    //
+                    // This makes generic benchmarks use the same order as their
+                    // types and constants.
+                    if location_ordering.is_eq() {
+                        entry_addr_ordering.unwrap_or(Ordering::Equal)
+                    } else {
+                        location_ordering
+                    }
+                }
+            };
+
+            if ordering.is_ne() {
+                return ordering;
+            }
+        }
+
+        Ordering::Equal
+    }
+
+    /// Helper for constructing a tree: inserts `entry` under `rem_modules`.
+    ///
+    /// This uses recursion because the iterative approach runs into limitations
+    /// with mutable borrows.
+    fn insert_entry(
+        tree: &mut Vec<Self>,
+        entry: AnyBenchEntry<'a>,
+        rem_modules: &mut dyn Iterator<Item = &'a str>,
+    ) {
+        let Some(current_module) = rem_modules.next() else {
+            tree.push(Self::Leaf {
+                entry,
+                args: entry.arg_names().map(|args| args.iter().collect()),
+            });
+            return;
+        };
+
+        let Some(children) = Self::get_children(tree, current_module) else {
+            tree.push(Self::from_path(entry, current_module, rem_modules));
+            return;
+        };
+
+        Self::insert_entry(children, entry, rem_modules);
+    }
+
+    /// Constructs a chain of single-child parents from a module path, ending in a leaf.
+    fn from_path(
+        entry: AnyBenchEntry<'a>,
+        current_module: &'a str,
+        rem_modules: &mut dyn Iterator<Item = &'a str>,
+    ) -> Self {
+        let child = if let Some(next_module) = rem_modules.next() {
+            Self::from_path(entry, next_module, rem_modules)
+        } else {
+            Self::Leaf { entry, args: entry.arg_names().map(|args| args.iter().collect()) }
+        };
+        Self::Parent { raw_name: current_module, group: None, children: vec![child] }
+    }
+
+    /// Finds the `children` of the first `Parent` in `tree` named `module`.
+    fn get_children<'t>(tree: &'t mut [Self], module: &str) -> Option<&'t mut Vec<Self>> {
+        tree.iter_mut().find_map(|tree| match tree {
+            Self::Parent { raw_name, children, group: _ } if *raw_name == module => Some(children),
+            _ => None,
+        })
+    }
+
+    /// Returns an integer denoting the enum variant: `0` for a leaf and `1` for
+    /// a parent.
+    ///
+    /// This is used instead of `std::mem::Discriminant` because it does not
+    /// implement `Ord`.
+    pub fn kind(&self) -> i32 {
+        // Leaves should appear before parents, so a leaf gets the smaller
+        // value.
+        let is_parent = matches!(self, Self::Parent { .. });
+        i32::from(is_parent)
+    }
+
+    /// Returns the entry or group address to use as identity, if there is one.
+    pub fn entry_addr(&self) -> Option<NonNull<()>> {
+        match self {
+            Self::Leaf { entry, .. } => Some(entry.entry_addr()),
+            Self::Parent { group, .. } => {
+                group.map(|entry: &GroupEntry| NonNull::from(entry).cast())
+            }
+        }
+    }
+
+    pub fn meta(&self) -> Option<&'a EntryMeta> {
+        match *self {
+            Self::Leaf { entry, .. } => Some(entry.meta()),
+            Self::Parent { group, .. } => group.map(|group| &group.meta),
+        }
+    }
+
+    pub fn bench_options(&self) -> Option<&'a BenchOptions> {
+        self.meta().and_then(|meta| meta.bench_options())
+    }
+
+    pub fn raw_name(&self) -> &'a str {
+        match *self {
+            Self::Leaf { entry, .. } => entry.raw_name(),
+            Self::Parent { group: Some(group), .. } => group.meta.raw_name,
+            Self::Parent { group: None, raw_name, .. } => raw_name,
+        }
+    }
+
+    pub fn display_name(&self) -> &'a str {
+        match *self {
+            Self::Leaf { entry, .. } => entry.display_name(),
+            Self::Parent { group: Some(group), .. } => group.meta.display_name,
+            // Ungrouped parents are shown without a raw-identifier prefix.
+            Self::Parent { group: None, raw_name, .. } => {
+                raw_name.strip_prefix("r#").unwrap_or(raw_name)
+            }
+        }
+    }
+
+    /// Returns the location from this node's metadata when it has any;
+    /// otherwise the earliest location found among its children.
+    fn location(&self) -> Option<&'a EntryLocation> {
+        match self.meta() {
+            Some(meta) => Some(&meta.location),
+            // No entry or group here, so fall back to the minimum of the children.
+            None => self.children().iter().filter_map(Self::location).min(),
+        }
+    }
+
+    /// Compares display names naturally, taking into account integers.
+    ///
+    /// When both nodes are leaves for `const` generic entries, the comparison is
+    /// delegated to `EntryConst::cmp_name`, so that `EntryConst` can sort
+    /// integers and floats by value instead of lexicographically.
+    fn cmp_display_name(&self, other: &Self) -> Ordering {
+        match (self, other) {
+            (
+                Self::Leaf {
+                    entry:
+                        AnyBenchEntry::GenericBench(GenericBenchEntry {
+                            const_value: Some(this), ..
+                        }),
+                    ..
+                },
+                Self::Leaf {
+                    entry:
+                        AnyBenchEntry::GenericBench(GenericBenchEntry {
+                            const_value: Some(other), ..
+                        }),
+                    ..
+                },
+            ) => this.cmp_name(other),
+
+            _ => natural_cmp(self.display_name(), other.display_name()),
+        }
+    }
+
+    /// Returns the child nodes of a parent; a leaf yields an empty
+    /// slice.
+    fn children(&self) -> &[Self] {
+        let Self::Parent { children, .. } = self else { return &[] };
+        children
+    }
+
+    /// Returns the names of a leaf's runtime arguments; `None` for parents
+    /// and for leaves without arguments.
+    fn arg_names(&self) -> Option<&[&'static &'static str]> {
+        let Self::Leaf { args, .. } = self else { return None };
+        args.as_deref()
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/lib.rs b/crates/divan_compat/divan_fork/src/lib.rs
new file mode 100644
index 00000000..7eaa96dd
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/lib.rs
@@ -0,0 +1,1321 @@
+//! [bench_attr]: macro@bench
+//! [bench_attr_examples]: macro@bench#examples
+//! [bench_attr_threads]: macro@bench#threads
+#![doc = include_str!("../README.md")]
+#![warn(missing_docs)]
+#![allow(
+    unknown_lints,
+    unused_unsafe,
+    clippy::needless_doctest_main,
+    clippy::needless_lifetimes,
+    clippy::new_without_default,
+    clippy::type_complexity,
+    clippy::missing_transmute_annotations
+)]
+
+// Used by generated code. Not public API and thus not subject to SemVer.
+#[doc(hidden)]
+#[path = "private.rs"]
+pub mod __private;
+
+mod alloc;
+mod bench;
+mod cli;
+mod compile_fail;
+mod config;
+mod divan;
+mod entry;
+mod stats;
+mod thread_pool;
+mod time;
+mod tree_painter;
+mod util;
+
+pub mod counter;
+
+/// Prevents compiler optimizations on a value.
+///
+/// `black_box` should only be used on [inputs](#benchmark-inputs) and
+/// [outputs](#benchmark-outputs) of benchmarks. Newcomers to benchmarking may
+/// be tempted to also use `black_box` within the implementation, but doing so
+/// will overly pessimize the measured code without any benefit.
+///
+/// ## Benchmark Inputs
+///
+/// When benchmarking, it's good practice to ensure measurements are accurate by
+/// preventing the compiler from optimizing based on assumptions about benchmark
+/// inputs.
+///
+/// The compiler can optimize code for indices it knows about, such as by
+/// removing bounds checks or unrolling loops. If real-world use of your code
+/// would not know indices up front, consider preventing optimizations on them
+/// in benchmarks:
+///
+/// ```
+/// use divan::black_box;
+///
+/// const INDEX: usize = // ...
+/// # 0;
+/// const SLICE: &[u8] = // ...
+/// # &[];
+///
+/// #[divan::bench]
+/// fn bench() {
+///     # fn work<T>(_: T) {}
+///     work(&SLICE[black_box(INDEX)..]);
+/// }
+/// ```
+///
+/// The compiler may also optimize for the data itself, which can also be
+/// avoided with `black_box`:
+///
+/// ```
+/// # use divan::black_box;
+/// # const INDEX: usize = 0;
+/// # const SLICE: &[u8] = &[];
+/// #[divan::bench]
+/// fn bench() {
+///     # fn work<T>(_: T) {}
+///     work(black_box(&SLICE[black_box(INDEX)..]));
+/// }
+/// ```
+///
+/// ## Benchmark Outputs
+///
+/// When benchmarking, it's best to ensure that all of the code is actually
+/// being run. If the compiler knows an output is unused, it may remove the code
+/// that generated the output. This optimization can make benchmarks appear much
+/// faster than they really are.
+///
+/// At the end of a benchmark, we can force the compiler to treat outputs as if
+/// they were actually used:
+///
+/// ```
+/// # use divan::black_box;
+/// #[divan::bench]
+/// fn bench() {
+///     # let value = 1;
+///     black_box(value.to_string());
+/// }
+/// ```
+///
+/// To make the code clearer to readers that the output is discarded, this code
+/// could instead call [`black_box_drop`].
+///
+/// Alternatively, the output can be returned from the benchmark:
+///
+/// ```
+/// #[divan::bench]
+/// fn bench() -> String {
+///     # let value = 1;
+///     value.to_string()
+/// }
+/// ```
+///
+/// Returning the output will `black_box` it and also avoid measuring the time
+/// to [drop](Drop) the output, which in this case is the time to deallocate a
+/// [`String`]. Read more about this in the [`#[divan::bench]`
+/// docs](macro@bench#drop).
+///
+/// ---
+///
+/// <h1>Standard Library Documentation</h1>
+///
+#[doc(inline)]
+pub use std::hint::black_box;
+
+#[doc(inline)]
+pub use crate::{alloc::AllocProfiler, bench::Bencher, divan::Divan};
+
+/// Runs every registered benchmark on the [`Divan`] instance built by
+/// `Divan::from_args()`. See [`#[divan::bench]`](macro@bench) for more examples.
+///
+/// # Examples
+///
+/// ```
+/// #[divan::bench]
+/// fn add() -> i32 {
+///     // ...
+///     # 0
+/// }
+///
+/// fn main() {
+///     // Run `add` benchmark:
+///     divan::main();
+/// }
+/// ```
+pub fn main() {
+    let divan = Divan::from_args();
+    divan.main();
+}
+
+/// Passes a value through [`black_box`] and then [`drop`]s it.
+///
+/// # Examples
+///
+/// Handy as the callback when driving a lazy [`Iterator`] to completion with
+/// [`for_each`](Iterator::for_each):
+///
+/// ```
+/// #[divan::bench]
+/// fn parse_iter() {
+///     let input: &str = // ...
+///     # "";
+///
+///     # struct Parser;
+///     # impl Parser {
+///     #   fn new(_: &str) -> Parser { Parser }
+///     #   fn for_each(self, _: fn(&'static str)) {}
+///     # }
+///     Parser::new(input)
+///         .for_each(divan::black_box_drop);
+/// }
+/// ```
+#[inline]
+pub fn black_box_drop<T>(dummy: T) {
+    drop(black_box(dummy));
+}
+
+/// Registers a benchmarking function.
+///
+/// # Examples
+///
+/// The quickest way to get started is to benchmark the function as-is:
+///
+/// ```
+/// use divan::black_box;
+///
+/// #[divan::bench]
+/// fn add() -> i32 {
+///     black_box(1) + black_box(42)
+/// }
+///
+/// fn main() {
+///     // Run `add` benchmark:
+///     divan::main();
+/// }
+/// ```
+///
+/// If benchmarks need to setup context before running, they can take a
+/// [`Bencher`] and use [`Bencher::bench`]:
+///
+/// ```
+/// use divan::{Bencher, black_box};
+///
+/// #[divan::bench]
+/// fn copy_from_slice(bencher: Bencher) {
+///     let src = (0..100).collect::<Vec<i32>>();
+///     let mut dst = vec![0; src.len()];
+///
+///     bencher.bench_local(move || {
+///         black_box(&mut dst).copy_from_slice(black_box(&src));
+///     });
+/// }
+/// ```
+///
+/// Applying this attribute multiple times to the same item will cause a compile
+/// error:
+///
+/// ```compile_fail
+/// #[divan::bench]
+/// #[divan::bench]
+/// fn bench() {
+///     // ...
+/// }
+/// ```
+///
+/// # Drop
+///
+/// When a benchmarked function returns a value, it will not be [dropped][Drop]
+/// until after the current sample loop is finished. This allows for more
+/// precise timing measurements.
+///
+/// Note that there is an inherent memory cost to defer drop, including
+/// allocations inside not-yet-dropped values. Also, if the benchmark
+/// [panics](macro@std::panic), the values will never be dropped.
+///
+/// The following example benchmarks will only measure [`String`] construction
+/// time, but not deallocation time:
+///
+/// ```
+/// use divan::{Bencher, black_box};
+///
+/// #[divan::bench]
+/// fn freestanding() -> String {
+///     black_box("hello").to_uppercase()
+/// }
+///
+/// #[divan::bench]
+/// fn contextual(bencher: Bencher) {
+///     // Setup:
+///     let s: String = // ...
+///     # String::new();
+///
+///     bencher.bench(|| -> String {
+///         black_box(&s).to_lowercase()
+///     });
+/// }
+/// ```
+///
+/// If the returned value *does not* need to be dropped, there is no memory
+/// cost. Because of this, the following example benchmarks are equivalent:
+///
+/// ```
+/// #[divan::bench]
+/// fn with_return() -> i32 {
+///     let n: i32 = // ...
+///     # 0;
+///     n
+/// }
+///
+/// #[divan::bench]
+/// fn without_return() {
+///     let n: i32 = // ...
+///     # 0;
+///     divan::black_box(n);
+/// }
+/// ```
+///
+/// # Options
+///
+/// - [`name`]
+/// - [`crate`]
+/// - [`args`]
+/// - [`consts`]
+/// - [`types`]
+/// - [`sample_count`]
+/// - [`sample_size`]
+/// - [`threads`]
+/// - [`counters`]
+///     - [`bytes_count`]
+///     - [`chars_count`]
+///     - [`items_count`]
+/// - [`min_time`]
+/// - [`max_time`]
+/// - [`skip_ext_time`]
+/// - [`ignore`]
+///
+/// ## `name`
+/// [`name`]: #name
+///
+/// By default, the benchmark uses the function's name. It can be overridden via
+/// the [`name`] option:
+///
+/// ```
+/// #[divan::bench(name = "my_add")]
+/// fn add() -> i32 {
+///     // Will appear as "crate_name::my_add".
+///     # 0
+/// }
+/// ```
+///
+/// ## `crate`
+/// [`crate`]: #crate
+///
+/// The path to the specific `divan` crate instance used by this macro's
+/// generated code can be specified via the [`crate`] option. This is applicable
+/// when using `divan` via a macro from your own crate.
+///
+/// ```
+/// extern crate divan as sofa;
+///
+/// #[::sofa::bench(crate = ::sofa)]
+/// fn add() -> i32 {
+///     // ...
+///     # 0
+/// }
+/// ```
+///
+/// ## `args`
+/// [`args`]: #args
+///
+/// Function arguments can be provided to benchmark the function over multiple
+/// cases. This is used for comparing across parameters like collection lengths
+/// and [`enum`](https://doc.rust-lang.org/std/keyword.enum.html) variants. If
+/// you are not comparing cases and just need to pass a value into the
+/// benchmark, instead consider passing local values into the [`Bencher::bench`]
+/// closure or use [`Bencher::with_inputs`] for many distinct values.
+///
+/// The following example benchmarks converting a [`Range`](std::ops::Range) to
+/// [`Vec`] over different lengths:
+///
+/// ```
+/// #[divan::bench(args = [1000, LEN, len()])]
+/// fn init_vec(len: usize) -> Vec<usize> {
+///     (0..len).collect()
+/// }
+///
+/// const LEN: usize = // ...
+/// # 0;
+///
+/// fn len() -> usize {
+///     // ...
+///     # 0
+/// }
+/// ```
+///
+/// The list of arguments can be shared across multiple benchmarks through an
+/// external [`IntoIterator`]:
+///
+/// ```
+/// const LENS: &[usize] = // ...
+/// # &[];
+///
+/// #[divan::bench(args = LENS)]
+/// fn bench_vec1(len: usize) -> Vec<usize> {
+///     // ...
+///     # vec![]
+/// }
+///
+/// #[divan::bench(args = LENS)]
+/// fn bench_vec2(len: usize) -> Vec<usize> {
+///     // ...
+///     # vec![]
+/// }
+/// ```
+///
+/// Unlike the [`consts`] option, any argument type is supported if it
+/// implements [`Any`], [`Copy`], [`Send`], [`Sync`], and [`ToString`] (or
+/// [`Debug`](std::fmt::Debug)):
+///
+/// ```
+/// #[derive(Clone, Copy, Debug)]
+/// enum Arg {
+///     A, B
+/// }
+///
+/// #[divan::bench(args = [Arg::A, Arg::B])]
+/// fn bench_args(arg: Arg) {
+///     // ...
+/// }
+/// ```
+///
+/// The argument type does not need to implement [`Copy`] if it is used through
+/// a reference:
+///
+/// ```
+/// #[derive(Debug)]
+/// enum Arg {
+///     A, B
+/// }
+///
+/// #[divan::bench(args = [Arg::A, Arg::B])]
+/// fn bench_args(arg: &Arg) {
+///     // ...
+/// }
+/// ```
+///
+/// For convenience, common string types are coerced to [`&str`](primitive@str):
+///
+/// ```
+/// fn strings() -> impl Iterator<Item = String> {
+///     // ...
+///     # [].into_iter()
+/// }
+///
+/// #[divan::bench(args = strings())]
+/// fn bench_strings(s: &str) {
+///     // ...
+/// }
+/// ```
+///
+/// Arguments can also be used with [`Bencher`]. This allows for generating
+/// inputs based on [`args`] values or providing throughput information via
+/// [`Counter`s](crate::counter::Counter):
+///
+/// ```
+/// # fn new_value<T>(v: T) -> T { v }
+/// # fn do_work<T>(_: T) {}
+/// use divan::Bencher;
+///
+/// #[divan::bench(args = [1, 2, 3])]
+/// fn bench(bencher: Bencher, len: usize) {
+///     let value = new_value(len);
+///
+///     bencher
+///         .counter(len)
+///         .bench(|| {
+///             do_work(value);
+///         });
+/// }
+/// ```
+///
+/// ## `consts`
+/// [`consts`]: #consts
+///
+/// Divan supports benchmarking functions with [`const`
+/// generics](https://doc.rust-lang.org/reference/items/generics.html#const-generics)
+/// via the [`consts`] option.
+///
+/// The following example benchmarks initialization of [`[i32; N]`](prim@array)
+/// for values of `N` provided by a [literal](https://doc.rust-lang.org/reference/expressions/literal-expr.html),
+/// [`const` item](https://doc.rust-lang.org/reference/items/constant-items.html),
+/// and [`const fn`](https://doc.rust-lang.org/reference/const_eval.html#const-functions):
+///
+/// ```
+/// #[divan::bench(consts = [1000, LEN, len()])]
+/// fn init_array<const N: usize>() -> [i32; N] {
+///     let mut result = [0; N];
+///
+///     for i in 0..N {
+///         result[i] = divan::black_box(i as i32);
+///     }
+///
+///     result
+/// }
+///
+/// const LEN: usize = // ...
+/// # 0;
+///
+/// const fn len() -> usize {
+///     // ...
+///     # 0
+/// }
+/// ```
+///
+/// The list of constants can be shared across multiple benchmarks through an
+/// external [array](prim@array) or [slice](prim@slice):
+///
+/// ```
+/// const SIZES: &[usize] = &[1, 2, 5, 10];
+///
+/// #[divan::bench(consts = SIZES)]
+/// fn bench_array1<const N: usize>() -> [i32; N] {
+///     // ...
+///     # [0; N]
+/// }
+///
+/// #[divan::bench(consts = SIZES)]
+/// fn bench_array2<const N: usize>() -> [i32; N] {
+///     // ...
+///     # [0; N]
+/// }
+/// ```
+///
+/// External constants are limited to lengths 1 through 20, because of
+/// implementation details. This limit does not apply if the list is provided
+/// directly like in the first example.
+///
+/// ```compile_fail
+/// const SIZES: [usize; 21] = [
+///     // ...
+///     # 0; 21
+/// ];
+///
+/// #[divan::bench(consts = SIZES)]
+/// fn bench_array<const N: usize>() -> [i32; N] {
+///     // ...
+///     # [0; N]
+/// }
+/// ```
+///
+/// ## `types`
+/// [`types`]: #types
+///
+/// Divan supports benchmarking generic functions over a list of types via the
+/// [`types`] option.
+///
+/// The following example benchmarks the [`From<&str>`](From) implementations
+/// for [`&str`](prim@str) and [`String`]:
+///
+/// ```
+/// #[divan::bench(types = [&str, String])]
+/// fn from_str<'a, T>() -> T
+/// where
+///     T: From<&'a str>,
+/// {
+///     divan::black_box("hello world").into()
+/// }
+/// ```
+///
+/// The [`types`] and [`args`] options can be combined to benchmark _T_ × _A_
+/// scenarios. The following example benchmarks the [`FromIterator`]
+/// implementations for [`Vec`], [`BTreeSet`], and [`HashSet`]:
+///
+/// ```
+/// use std::collections::{BTreeSet, HashSet};
+///
+/// #[divan::bench(
+///     types = [Vec<i32>, BTreeSet<i32>, HashSet<i32>],
+///     args = [0, 2, 4, 16, 256, 4096],
+/// )]
+/// fn from_range<T>(n: i32) -> T
+/// where
+///     T: FromIterator<i32>,
+/// {
+///     (0..n).collect()
+/// }
+/// ```
+///
+/// [`BTreeSet`]: std::collections::BTreeSet
+/// [`HashSet`]: std::collections::HashSet
+///
+/// ## `sample_count`
+/// [`sample_count`]: #sample_count
+///
+/// The number of statistical sample recordings can be set to a predetermined
+/// [`u32`] value via the [`sample_count`] option. This may be overridden at
+/// runtime using either the `DIVAN_SAMPLE_COUNT` environment variable or
+/// `--sample-count` CLI argument.
+///
+/// ```
+/// #[divan::bench(sample_count = 1000)]
+/// fn add() -> i32 {
+///     // ...
+///     # 0
+/// }
+/// ```
+///
+/// If the [`threads`] option is enabled, sample count becomes a multiple of the
+/// number of threads. This is because each thread operates over the same sample
+/// size to ensure there are always N competing threads doing the same amount of
+/// work.
+///
+/// ## `sample_size`
+/// [`sample_size`]: #sample_size
+///
+/// The number of iterations within each statistical sample can be set to a
+/// predetermined [`u32`] value via the [`sample_size`] option. This may be
+/// overridden at runtime using either the `DIVAN_SAMPLE_SIZE` environment
+/// variable or `--sample-size` CLI argument.
+///
+/// ```
+/// #[divan::bench(sample_size = 1000)]
+/// fn add() -> i32 {
+///     // ...
+///     # 0
+/// }
+/// ```
+///
+/// ## `threads`
+/// [`threads`]: #threads
+///
+/// Benchmarked functions can be run across multiple threads via the [`threads`]
+/// option. This enables you to measure contention on [atomics and
+/// locks][std::sync]. The default thread count is the [available parallelism].
+///
+/// ```
+/// use std::sync::Arc;
+///
+/// #[divan::bench(threads)]
+/// fn arc_clone(bencher: divan::Bencher) {
+///     let arc = Arc::new(42);
+///
+///     bencher.bench(|| arc.clone());
+/// }
+/// ```
+///
+/// The [`threads`] option can be set to any of:
+/// - [`bool`] for [available parallelism] (true) or no parallelism.
+/// - [`usize`] for a specific number of threads. 0 means use [available
+///   parallelism] and 1 means no parallelism.
+/// - [`IntoIterator`] over [`usize`] for multiple thread counts, such as:
+///     - [`Range<usize>`](std::ops::Range)
+///     - [`[usize; N]`](prim@array)
+///     - [`&[usize]`](prim@slice)
+///
+/// ```
+/// #[divan::bench(threads = false)]
+/// fn single() {
+///     // ...
+/// }
+///
+/// #[divan::bench(threads = 10)]
+/// fn specific() {
+///     // ...
+/// }
+///
+/// #[divan::bench(threads = 0..=8)]
+/// fn range() {
+///     // Note: Includes 0 for available parallelism.
+/// }
+///
+/// #[divan::bench(threads = [0, 1, 4, 8, 16])]
+/// fn selection() {
+///     // ...
+/// }
+/// ```
+///
+/// ## `counters`
+/// [`counters`]: #counters
+///
+/// The [`Counter`s](crate::counter::Counter) of each iteration can be set via
+/// the [`counters`] option. The following example emits info for the number of
+/// bytes and number of ints processed when benchmarking [slice sorting](slice::sort):
+///
+/// ```
+/// use divan::{Bencher, counter::{BytesCount, ItemsCount}};
+///
+/// const INTS: &[i32] = &[
+///     // ...
+/// ];
+///
+/// #[divan::bench(counters = [
+///     BytesCount::of_slice(INTS),
+///     ItemsCount::new(INTS.len()),
+/// ])]
+/// fn sort(bencher: Bencher) {
+///     bencher
+///         .with_inputs(|| INTS.to_vec())
+///         .bench_refs(|ints| ints.sort());
+/// }
+/// ```
+///
+/// For convenience, singular `counter` allows a single
+/// [`Counter`](crate::counter::Counter) to be set. The following example emits
+/// info for the number of bytes processed when benchmarking
+/// [`char`-counting](std::str::Chars::count):
+///
+/// ```
+/// use divan::counter::BytesCount;
+///
+/// const STR: &str = "...";
+///
+/// #[divan::bench(counter = BytesCount::of_str(STR))]
+/// fn char_count() -> usize {
+///     divan::black_box(STR).chars().count()
+/// }
+/// ```
+///
+/// See:
+/// - [`#[divan::bench_group(counters = ...)]`](macro@bench_group#counters)
+/// - [`Bencher::counter`]
+/// - [`Bencher::input_counter`]
+///
+/// ### `bytes_count`
+/// [`bytes_count`]: #bytes_count
+///
+/// Convenience shorthand for
+/// <code>[counter](#counters) = [BytesCount](counter::BytesCount)::from(n)</code>.
+///
+/// ### `chars_count`
+/// [`chars_count`]: #chars_count
+///
+/// Convenience shorthand for
+/// <code>[counter](#counters) = [CharsCount](counter::CharsCount)::from(n)</code>.
+///
+/// ### `items_count`
+/// [`items_count`]: #items_count
+///
+/// Convenience shorthand for
+/// <code>[counter](#counters) = [ItemsCount](counter::ItemsCount)::from(n)</code>.
+///
+/// ## `min_time`
+/// [`min_time`]: #min_time
+///
+/// The minimum time spent benchmarking each function can be set to a
+/// predetermined [`Duration`] via the [`min_time`] option. This may be
+/// overridden at runtime using either the `DIVAN_MIN_TIME` environment variable
+/// or `--min-time` CLI argument.
+///
+/// Unless [`skip_ext_time`] is set, this includes time external to the
+/// benchmarked function, such as time spent generating inputs and running
+/// [`Drop`].
+///
+/// ```
+/// use std::time::Duration;
+///
+/// #[divan::bench(min_time = Duration::from_secs(3))]
+/// fn add() -> i32 {
+///     // ...
+///     # 0
+/// }
+/// ```
+///
+/// For convenience, [`min_time`] can also be set with seconds as [`u64`] or
+/// [`f64`]. Invalid values will cause a panic at runtime.
+///
+/// ```
+/// #[divan::bench(min_time = 2)]
+/// fn int_secs() -> i32 {
+///     // ...
+///     # 0
+/// }
+///
+/// #[divan::bench(min_time = 1.5)]
+/// fn float_secs() -> i32 {
+///     // ...
+///     # 0
+/// }
+/// ```
+///
+/// ## `max_time`
+/// [`max_time`]: #max_time
+///
+/// The maximum time spent benchmarking each function can be set to a
+/// predetermined [`Duration`] via the [`max_time`] option. This may be
+/// overridden at runtime using either the `DIVAN_MAX_TIME` environment variable
+/// or `--max-time` CLI argument.
+///
+/// Unless [`skip_ext_time`] is set, this includes time external to the
+/// benchmarked function, such as time spent generating inputs and running
+/// [`Drop`].
+///
+/// If `min_time > max_time`, then [`max_time`] has priority and [`min_time`]
+/// will not be reached.
+///
+/// ```
+/// use std::time::Duration;
+///
+/// #[divan::bench(max_time = Duration::from_secs(5))]
+/// fn add() -> i32 {
+///     // ...
+///     # 0
+/// }
+/// ```
+///
+/// For convenience, like [`min_time`], [`max_time`] can also be set with
+/// seconds as [`u64`] or [`f64`]. Invalid values will cause a panic at runtime.
+///
+/// ```
+/// #[divan::bench(max_time = 8)]
+/// fn int_secs() -> i32 {
+///     // ...
+///     # 0
+/// }
+///
+/// #[divan::bench(max_time = 9.5)]
+/// fn float_secs() -> i32 {
+///     // ...
+///     # 0
+/// }
+/// ```
+///
+/// ## `skip_ext_time`
+/// [`skip_ext_time`]: #skip_ext_time
+///
+/// By default, [`min_time`] and [`max_time`] include time external to the
+/// benchmarked function, such as time spent generating inputs and running
+/// [`Drop`]. Enabling the [`skip_ext_time`] option will instead make those
+/// options only consider time spent within the benchmarked function. This may
+/// be overridden at runtime using either the `DIVAN_SKIP_EXT_TIME` environment
+/// variable or `--skip-ext-time` CLI argument.
+///
+/// In the following example, [`max_time`] only considers time spent running
+/// `measured_function`:
+///
+/// ```
+/// # fn generate_input() {}
+/// # fn measured_function(_: ()) {}
+/// #[divan::bench(max_time = 5, skip_ext_time)]
+/// fn bench(bencher: divan::Bencher) {
+///     bencher
+///         .with_inputs(|| generate_input())
+///         .bench_values(|input| measured_function(input));
+/// }
+/// ```
+///
+/// This option can be set to an explicit [`bool`] value to override parent
+/// values:
+///
+/// ```
+/// #[divan::bench(max_time = 5, skip_ext_time = false)]
+/// fn bench(bencher: divan::Bencher) {
+///     // ...
+/// }
+/// ```
+///
+/// ## `ignore`
+/// [`ignore`]: #ignore
+///
+/// Like [`#[test]`](https://doc.rust-lang.org/reference/attributes/testing.html#the-test-attribute),
+/// `#[divan::bench]` functions can use [`#[ignore]`](https://doc.rust-lang.org/reference/attributes/testing.html#the-ignore-attribute):
+///
+/// ```
+/// #[divan::bench]
+/// #[ignore]
+/// fn todo() {
+///     unimplemented!();
+/// }
+/// # divan::main();
+/// ```
+///
+/// This option can also instead be set within the `#[divan::bench]` attribute:
+///
+/// ```
+/// #[divan::bench(ignore)]
+/// fn todo() {
+///     unimplemented!();
+/// }
+/// # divan::main();
+/// ```
+///
+/// Like [`skip_ext_time`], this option can be set to an explicit [`bool`] value
+/// to override parent values:
+///
+/// ```
+/// #[divan::bench(ignore = false)]
+/// fn bench() {
+///     // ...
+/// }
+/// ```
+///
+/// This can be used to ignore benchmarks based on a runtime condition. The
+/// following example benchmark will be ignored if an [environment
+/// variable](std::env::var) is not set to "true":
+///
+/// ```
+/// #[divan::bench(
+///     ignore = std::env::var("BENCH_EXPENSIVE").as_deref() != Ok("true")
+/// )]
+/// fn expensive_bench() {
+///     // ...
+/// }
+/// ```
+///
+/// [`Any`]: std::any::Any
+/// [`Duration`]: std::time::Duration
+/// [available parallelism]: std::thread::available_parallelism
+pub use divan_macros::bench;
+
+/// Registers a benchmarking group.
+///
+/// # Examples
+///
+/// This is used for setting [options] shared across
+/// [`#[divan::bench]`](macro@bench) functions in the same module:
+///
+/// ```
+/// #[divan::bench_group(
+///     sample_count = 100,
+///     sample_size = 500,
+/// )]
+/// mod math {
+///     use divan::black_box;
+///
+///     #[divan::bench]
+///     fn add() -> i32 {
+///         black_box(1) + black_box(42)
+///     }
+///
+///     #[divan::bench]
+///     fn div() -> i32 {
+///         black_box(1) / black_box(42)
+///     }
+/// }
+///
+/// fn main() {
+///     // Run `math::add` and `math::div` benchmarks:
+///     divan::main();
+/// }
+/// ```
+///
+/// Benchmarking [options] set on parent groups cascade into child groups and
+/// their benchmarks:
+///
+/// ```
+/// #[divan::bench_group(
+///     sample_count = 100,
+///     sample_size = 500,
+/// )]
+/// mod parent {
+///     #[divan::bench_group(sample_size = 1)]
+///     mod child1 {
+///         #[divan::bench]
+///         fn bench() {
+///             // Will be sampled 100 times with 1 iteration per sample.
+///         }
+///     }
+///
+///     #[divan::bench_group(sample_count = 42)]
+///     mod child2 {
+///         #[divan::bench]
+///         fn bench() {
+///             // Will be sampled 42 times with 500 iterations per sample.
+///         }
+///     }
+///
+///     mod child3 {
+///         #[divan::bench(sample_count = 1)]
+///         fn bench() {
+///             // Will be sampled 1 time with 500 iterations per sample.
+///         }
+///     }
+/// }
+/// ```
+///
+/// Applying this attribute multiple times to the same item will cause a compile
+/// error:
+///
+/// ```compile_fail
+/// #[divan::bench_group]
+/// #[divan::bench_group]
+/// mod math {
+///     // ...
+/// }
+/// ```
+///
+/// # Options
+/// [options]: #options
+///
+/// - [`name`]
+/// - [`crate`]
+/// - [`sample_count`]
+/// - [`sample_size`]
+/// - [`threads`]
+/// - [`counters`]
+///     - [`bytes_count`]
+///     - [`chars_count`]
+///     - [`items_count`]
+/// - [`min_time`]
+/// - [`max_time`]
+/// - [`skip_ext_time`]
+/// - [`ignore`]
+///
+/// ## `name`
+/// [`name`]: #name
+///
+/// By default, the benchmark group uses the module's name. It can be overridden
+/// via the `name` option:
+///
+/// ```
+/// #[divan::bench_group(name = "my_math")]
+/// mod math {
+///     #[divan::bench(name = "my_add")]
+///     fn add() -> i32 {
+///         // Will appear as "crate_name::my_math::my_add".
+///         # 0
+///     }
+/// }
+/// ```
+///
+/// ## `crate`
+/// [`crate`]: #crate
+///
+/// The path to the specific `divan` crate instance used by this macro's
+/// generated code can be specified via the [`crate`] option. This is applicable
+/// when using `divan` via a macro from your own crate.
+///
+/// ```
+/// extern crate divan as sofa;
+///
+/// #[::sofa::bench_group(crate = ::sofa)]
+/// mod math {
+///     #[::sofa::bench(crate = ::sofa)]
+///     fn add() -> i32 {
+///         // ...
+///         # 0
+///     }
+/// }
+/// ```
+///
+/// ## `sample_count`
+/// [`sample_count`]: #sample_count
+///
+/// The number of statistical sample recordings can be set to a predetermined
+/// [`u32`] value via the [`sample_count`] option. This may be overridden at
+/// runtime using either the `DIVAN_SAMPLE_COUNT` environment variable or
+/// `--sample-count` CLI argument.
+///
+/// ```
+/// #[divan::bench_group(sample_count = 1000)]
+/// mod math {
+///     #[divan::bench]
+///     fn add() -> i32 {
+///         // ...
+///         # 0
+///     }
+/// }
+/// ```
+///
+/// If the [`threads`] option is enabled, sample count becomes a multiple of the
+/// number of threads. This is because each thread operates over the same sample
+/// size to ensure there are always N competing threads doing the same amount of
+/// work.
+///
+/// ## `sample_size`
+/// [`sample_size`]: #sample_size
+///
+/// The number of iterations within each statistical sample can be set to a
+/// predetermined [`u32`] value via the [`sample_size`] option. This may be
+/// overridden at runtime using either the `DIVAN_SAMPLE_SIZE` environment
+/// variable or `--sample-size` CLI argument.
+///
+/// ```
+/// #[divan::bench_group(sample_size = 1000)]
+/// mod math {
+///     #[divan::bench]
+///     fn add() -> i32 {
+///         // ...
+///         # 0
+///     }
+/// }
+/// ```
+///
+/// ## `threads`
+/// [`threads`]: #threads
+///
+/// See [`#[divan::bench(threads = ...)]`](macro@bench#threads).
+///
+/// ## `counters`
+/// [`counters`]: #counters
+///
+/// The [`Counter`s](crate::counter::Counter) of each iteration of benchmarked
+/// functions in a group can be set via the [`counters`] option. The following
+/// example emits info for the number of bytes and number of ints processed when
+/// benchmarking [slice sorting](slice::sort):
+///
+/// ```
+/// use divan::{Bencher, counter::{BytesCount, ItemsCount}};
+///
+/// const INTS: &[i32] = &[
+///     // ...
+/// ];
+///
+/// #[divan::bench_group(counters = [
+///     BytesCount::of_slice(INTS),
+///     ItemsCount::new(INTS.len()),
+/// ])]
+/// mod sort {
+///     use super::*;
+///
+///     #[divan::bench]
+///     fn default(bencher: Bencher) {
+///         bencher
+///             .with_inputs(|| INTS.to_vec())
+///             .bench_refs(|ints| ints.sort());
+///     }
+///
+///     #[divan::bench]
+///     fn unstable(bencher: Bencher) {
+///         bencher
+///             .with_inputs(|| INTS.to_vec())
+///             .bench_refs(|ints| ints.sort_unstable());
+///     }
+/// }
+/// # fn main() {}
+/// ```
+///
+/// For convenience, singular `counter` allows a single
+/// [`Counter`](crate::counter::Counter) to be set. The following example emits
+/// info for the number of bytes processed when benchmarking
+/// [`char`-counting](std::str::Chars::count) and
+/// [`char`-collecting](std::str::Chars::collect):
+///
+/// ```
+/// use divan::counter::BytesCount;
+///
+/// const STR: &str = "...";
+///
+/// #[divan::bench_group(counter = BytesCount::of_str(STR))]
+/// mod chars {
+///     use super::STR;
+///
+///     #[divan::bench]
+///     fn count() -> usize {
+///         divan::black_box(STR).chars().count()
+///     }
+///
+///     #[divan::bench]
+///     fn collect() -> String {
+///         divan::black_box(STR).chars().collect()
+///     }
+/// }
+/// # fn main() {}
+/// ```
+///
+/// See:
+/// - [`#[divan::bench(counters = ...)]`](macro@bench#counters)
+/// - [`Bencher::counter`]
+/// - [`Bencher::input_counter`]
+///
+/// ### `bytes_count`
+/// [`bytes_count`]: #bytes_count
+///
+/// Convenience shorthand for
+/// <code>[counter](#counters) = [BytesCount](counter::BytesCount)::from(n)</code>.
+///
+/// ### `chars_count`
+/// [`chars_count`]: #chars_count
+///
+/// Convenience shorthand for
+/// <code>[counter](#counters) = [CharsCount](counter::CharsCount)::from(n)</code>.
+///
+/// ### `cycles_count`
+/// [`cycles_count`]: #cycles_count
+///
+/// Convenience shorthand for
+/// <code>[counter](#counters) = [CyclesCount](counter::CyclesCount)::from(n)</code>.
+///
+/// ### `items_count`
+/// [`items_count`]: #items_count
+///
+/// Convenience shorthand for
+/// <code>[counter](#counters) = [ItemsCount](counter::ItemsCount)::from(n)</code>.
+///
+/// ## `min_time`
+/// [`min_time`]: #min_time
+///
+/// The minimum time spent benchmarking each function can be set to a
+/// predetermined [`Duration`] via the [`min_time`] option. This may be
+/// overridden at runtime using either the `DIVAN_MIN_TIME` environment variable
+/// or `--min-time` CLI argument.
+///
+/// Unless [`skip_ext_time`] is set, this includes time external to benchmarked
+/// functions, such as time spent generating inputs and running [`Drop`].
+///
+/// ```
+/// use std::time::Duration;
+///
+/// #[divan::bench_group(min_time = Duration::from_secs(3))]
+/// mod math {
+///     #[divan::bench]
+///     fn add() -> i32 {
+///         // ...
+///         # 0
+///     }
+/// }
+/// ```
+///
+/// For convenience, [`min_time`] can also be set with seconds as [`u64`] or
+/// [`f64`]. Invalid values will cause a panic at runtime.
+///
+/// ```
+/// #[divan::bench_group(min_time = 2)]
+/// mod int_secs {
+///     // ...
+/// }
+///
+/// #[divan::bench_group(min_time = 1.5)]
+/// mod float_secs {
+///     // ...
+/// }
+/// ```
+///
+/// ## `max_time`
+/// [`max_time`]: #max_time
+///
+/// The maximum time spent benchmarking each function can be set to a
+/// predetermined [`Duration`] via the [`max_time`] option. This may be
+/// overridden at runtime using either the `DIVAN_MAX_TIME` environment variable
+/// or `--max-time` CLI argument.
+///
+/// Unless [`skip_ext_time`] is set, this includes time external to benchmarked
+/// functions, such as time spent generating inputs and running [`Drop`].
+///
+/// If `min_time > max_time`, then [`max_time`] has priority and [`min_time`]
+/// will not be reached.
+///
+/// ```
+/// use std::time::Duration;
+///
+/// #[divan::bench_group(max_time = Duration::from_secs(5))]
+/// mod math {
+///     #[divan::bench]
+///     fn add() -> i32 {
+///         // ...
+///         # 0
+///     }
+/// }
+/// ```
+///
+/// For convenience, like [`min_time`], [`max_time`] can also be set with
+/// seconds as [`u64`] or [`f64`]. Invalid values will cause a panic at runtime.
+///
+/// ```
+/// #[divan::bench_group(max_time = 8)]
+/// mod int_secs {
+///     // ...
+/// }
+///
+/// #[divan::bench_group(max_time = 9.5)]
+/// mod float_secs {
+///     // ...
+/// }
+/// ```
+///
+/// ## `skip_ext_time`
+/// [`skip_ext_time`]: #skip_ext_time
+///
+/// By default, [`min_time`] and [`max_time`] include time external to
+/// benchmarked functions, such as time spent generating inputs and running
+/// [`Drop`]. Enabling the [`skip_ext_time`] option will instead make those
+/// options only consider time spent within benchmarked functions. This may be
+/// overridden at runtime using either the `DIVAN_SKIP_EXT_TIME` environment
+/// variable or `--skip-ext-time` CLI argument.
+///
+/// In the following example, [`max_time`] only considers time spent running
+/// `measured_function`:
+///
+/// ```
+/// #[divan::bench_group(skip_ext_time)]
+/// mod group {
+///     # fn generate_input() {}
+///     # fn measured_function(_: ()) {}
+///     #[divan::bench(max_time = 5)]
+///     fn bench(bencher: divan::Bencher) {
+///         bencher
+///             .with_inputs(|| generate_input())
+///             .bench_values(|input| measured_function(input));
+///     }
+/// }
+/// ```
+///
+/// This option can be set to an explicit [`bool`] value to override parent
+/// values:
+///
+/// ```
+/// #[divan::bench_group(skip_ext_time = false)]
+/// mod group {
+///     // ...
+/// }
+/// ```
+///
+/// ## `ignore`
+/// [`ignore`]: #ignore
+///
+/// Like [`#[test]`](https://doc.rust-lang.org/reference/attributes/testing.html#the-test-attribute)
+/// and [`#[divan::bench]`](macro@bench), `#[divan::bench_group]` modules can
+/// use [`#[ignore]`](https://doc.rust-lang.org/reference/attributes/testing.html#the-ignore-attribute):
+///
+/// ```
+/// #[divan::bench_group]
+/// #[ignore]
+/// mod math {
+///     #[divan::bench]
+///     fn todo() {
+///         unimplemented!();
+///     }
+/// }
+/// # divan::main();
+/// ```
+///
+/// This option can also instead be set within the `#[divan::bench_group]`
+/// attribute:
+///
+/// ```
+/// #[divan::bench_group(ignore)]
+/// mod math {
+///     #[divan::bench]
+///     fn todo() {
+///         unimplemented!();
+///     }
+/// }
+/// # divan::main();
+/// ```
+///
+/// Like [`skip_ext_time`], this option can be set to an explicit [`bool`] value
+/// to override parent values:
+///
+/// ```
+/// #[divan::bench_group(ignore = false)]
+/// mod group {
+///     // ...
+/// }
+/// ```
+///
+/// This can be used to ignore benchmarks based on a runtime condition. The
+/// following example benchmark group will be ignored if an [environment
+/// variable](std::env::var) is not set to "true":
+///
+/// ```
+/// #[divan::bench_group(
+///     ignore = std::env::var("BENCH_EXPENSIVE").as_deref() != Ok("true")
+/// )]
+/// mod expensive_benches {
+///     // ...
+/// }
+/// ```
+///
+/// [`Duration`]: std::time::Duration
+pub use divan_macros::bench_group;
diff --git a/crates/divan_compat/divan_fork/src/private.rs b/crates/divan_compat/divan_fork/src/private.rs
new file mode 100644
index 00000000..08cbd17e
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/private.rs
@@ -0,0 +1,229 @@
+use std::{
+    borrow::{Borrow, Cow},
+    fmt::Debug,
+};
+
+pub use crate::{
+    bench::{BenchArgs, BenchOptions},
+    entry::{
+        BenchEntry, BenchEntryRunner, EntryConst, EntryList, EntryLocation, EntryMeta, EntryType,
+        GenericBenchEntry, GroupEntry, BENCH_ENTRIES, GROUP_ENTRIES,
+    },
+    time::IntoDuration,
+};
+
+/// Helper to convert values to strings via `ToString` or fallback to `Debug`.
+///
+/// This works by having a `Debug`-based `ToString::to_string` method that will
+/// be chosen if the wrapped type implements `Debug` *but not* `ToString`. If
+/// the wrapped type implements `ToString`, then the inherent
+/// `ToStringHelper::to_string` method will be chosen instead.
+pub struct ToStringHelper<'a, T: 'static>(pub &'a T);
+
+#[allow(clippy::to_string_trait_impl)]
+impl<T: Debug> ToString for ToStringHelper<'_, T> {
+    #[inline]
+    fn to_string(&self) -> String {
+        format!("{:?}", self.0)
+    }
+}
+
+impl<T: ToString> ToStringHelper<'_, T> {
+    #[allow(clippy::inherent_to_string)]
+    #[inline]
+    pub fn to_string(&self) -> String {
+        self.0.to_string()
+    }
+}
+
+/// Used by `#[divan::bench(args = ...)]` to enable polymorphism.
+pub trait Arg<T> {
+    fn get(this: Self) -> T;
+}
+
+impl<T> Arg<T> for T {
+    #[inline]
+    fn get(this: Self) -> T {
+        this
+    }
+}
+
+impl<'a, T: ?Sized> Arg<&'a T> for &'a Cow<'a, T>
+where
+    T: ToOwned,
+{
+    #[inline]
+    fn get(this: Self) -> &'a T {
+        this
+    }
+}
+
+impl<'a> Arg<&'a str> for &'a String {
+    #[inline]
+    fn get(this: Self) -> &'a str {
+        this
+    }
+}
+
+impl<T: Copy> Arg<T> for &T {
+    #[inline]
+    fn get(this: Self) -> T {
+        *this
+    }
+}
+
+impl<T: Copy> Arg<T> for &&T {
+    #[inline]
+    fn get(this: Self) -> T {
+        **this
+    }
+}
+
+impl<T: Copy> Arg<T> for &&&T {
+    #[inline]
+    fn get(this: Self) -> T {
+        ***this
+    }
+}
+
+/// Used by `#[divan::bench(threads = ...)]` to leak thread counts for easy
+/// global usage in [`BenchOptions::threads`].
+///
+/// This enables the `threads` option to be polymorphic over:
+/// - `usize`
+/// - `bool`
+///     - `true` is 0
+///     - `false` is 1
+/// - Iterators:
+///     - `[usize; N]`
+///     - `&[usize; N]`
+///     - `&[usize]`
+///
+/// # Orphan Rules Hack
+///
+/// Normally we can't implement a trait over both `usize` and `I: IntoIterator`
+/// because the compiler has no guarantee that `usize` will never implement
+/// `IntoIterator`. Ideally we would handle this with specialization, but that's
+/// not stable.
+///
+/// The solution here is to make `IntoThreads` generic to implement technically
+/// different traits for `usize` and `IntoIterator` because of different `IMP`
+/// values. We then call verbatim `IntoThreads::into_threads(val)` and have the
+/// compiler infer the generic parameter for the single `IntoThreads`
+/// implementation.
+///
+/// It's fair to assume that scalar primitives will never implement
+/// `IntoIterator`, so this hack shouldn't break in the future 🤠.
+pub trait IntoThreads<const IMP: u32> {
+    fn into_threads(self) -> Cow<'static, [usize]>;
+}
+
+impl IntoThreads<0> for usize {
+    #[inline]
+    fn into_threads(self) -> Cow<'static, [usize]> {
+        let counts = match self {
+            0 => &[0],
+            1 => &[1],
+            2 => &[2],
+            _ => return Cow::Owned(vec![self]),
+        };
+        Cow::Borrowed(counts)
+    }
+}
+
+impl IntoThreads<0> for bool {
+    #[inline]
+    fn into_threads(self) -> Cow<'static, [usize]> {
+        let counts = if self {
+            // Available parallelism.
+            &[0]
+        } else {
+            // No parallelism.
+            &[1]
+        };
+        Cow::Borrowed(counts)
+    }
+}
+
+impl<I> IntoThreads<1> for I
+where
+    I: IntoIterator,
+    I::Item: Borrow<usize>,
+{
+    #[inline]
+    fn into_threads(self) -> Cow<'static, [usize]> {
+        let mut options: Vec<usize> = self.into_iter().map(|i| *i.borrow()).collect();
+        options.sort_unstable();
+        options.dedup();
+        Cow::Owned(options)
+    }
+}
+
+/// Used by `#[divan::bench(counters = [...])]`.
+#[inline]
+pub fn new_counter_set() -> crate::counter::CounterSet {
+    Default::default()
+}
+
+/// Used by `#[divan::bench]` to truncate arrays for generic `const` benchmarks.
+pub const fn shrink_array<T, const IN: usize, const OUT: usize>(
+    array: [T; IN],
+) -> Option<[T; OUT]> {
+    use std::mem::ManuallyDrop;
+
+    #[repr(C)]
+    union Transmute<F, I> {
+        from: ManuallyDrop<F>,
+        into: ManuallyDrop<I>,
+    }
+
+    let from = ManuallyDrop::new(array);
+
+    if OUT <= IN {
+        Some(unsafe { ManuallyDrop::into_inner(Transmute { from }.into) })
+    } else {
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn into_threads() {
+        macro_rules! test {
+            ($value:expr, $expected:expr) => {
+                assert_eq!(IntoThreads::into_threads($value).as_ref(), $expected);
+            };
+        }
+
+        test!(true, &[0]);
+        test!(false, &[1]);
+
+        test!(0, &[0]);
+        test!(1, &[1]);
+        test!(42, &[42]);
+
+        // test!([0; 0], &[]);
+        test!([0], &[0]);
+        test!([0, 0], &[0]);
+
+        test!([0, 2, 3, 1], &[0, 1, 2, 3]);
+        test!([0, 0, 2, 3, 2, 1, 3], &[0, 1, 2, 3]);
+    }
+
+    #[test]
+    fn shrink_array() {
+        let values = [1, 2, 3, 4, 5];
+
+        let equal: Option<[i32; 5]> = super::shrink_array(values);
+        assert_eq!(equal, Some(values));
+
+        let smaller: Option<[i32; 3]> = super::shrink_array(values);
+        assert_eq!(smaller, Some([1, 2, 3]));
+
+        let larger: Option<[i32; 100]> = super::shrink_array(values);
+        assert_eq!(larger, None);
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/stats/mod.rs b/crates/divan_compat/divan_fork/src/stats/mod.rs
new file mode 100644
index 00000000..39d0d759
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/stats/mod.rs
@@ -0,0 +1,61 @@
+//! Measurement statistics.
+
+use crate::{
+    alloc::{AllocOpMap, AllocTally},
+    counter::{KnownCounterKind, MaxCountUInt},
+    time::FineDuration,
+};
+
+mod sample;
+
+pub(crate) use sample::*;
+
+/// Statistics from samples.
+pub(crate) struct Stats {
+    /// Total number of samples taken.
+    pub sample_count: u32,
+
+    /// Total number of iterations (currently `sample_count * sample_size`).
+    pub iter_count: u64,
+
+    /// Timing statistics.
+    pub time: StatsSet<FineDuration>,
+
+    /// Maximum allocated bytes and maximum number of allocations associated
+    /// with the corresponding samples for `time`.
+    pub max_alloc: AllocTally<StatsSet<f64>>,
+
+    /// Allocation statistics associated with the corresponding samples for
+    /// `time`.
+    pub alloc_tallies: AllocOpMap<AllocTally<StatsSet<f64>>>,
+
+    /// `Counter` counts associated with the corresponding samples for `time`.
+    pub counts: [Option<StatsSet<MaxCountUInt>>; KnownCounterKind::COUNT],
+}
+
+impl Stats {
+    pub fn get_counts(&self, counter_kind: KnownCounterKind) -> Option<&StatsSet<MaxCountUInt>> {
+        self.counts[counter_kind as usize].as_ref()
+    }
+}
+
+#[derive(Debug)]
+pub(crate) struct StatsSet<T> {
+    /// Associated with minimum amount of time taken by an iteration.
+    pub fastest: T,
+
+    /// Associated with maximum amount of time taken by an iteration.
+    pub slowest: T,
+
+    /// Associated with midpoint time taken by an iteration.
+    pub median: T,
+
+    /// Associated with average time taken by all iterations.
+    pub mean: T,
+}
+
+impl StatsSet<f64> {
+    pub fn is_zero(&self) -> bool {
+        self.fastest == 0.0 && self.slowest == 0.0 && self.median == 0.0 && self.mean == 0.0
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/stats/sample.rs b/crates/divan_compat/divan_fork/src/stats/sample.rs
new file mode 100644
index 00000000..b1e1727d
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/stats/sample.rs
@@ -0,0 +1,80 @@
+use std::collections::HashMap;
+
+use crate::{
+    alloc::ThreadAllocInfo,
+    counter::KnownCounterKind,
+    time::{FineDuration, Timer, Timestamp},
+};
+
+/// Timing measurement.
+pub(crate) struct TimeSample {
+    /// The time this sample took to run.
+    ///
+    /// This is derived from a [`RawSample`] via:
+    /// `end.duration_since(start, timer).clamp_to(timer.precision())`.
+    pub duration: FineDuration,
+}
+
+/// Unprocessed measurement.
+///
+/// This cannot be serialized because [`Timestamp`] is an implementation detail
+/// for both the `Instant` and TSC timers.
+pub(crate) struct RawSample {
+    pub start: Timestamp,
+    pub end: Timestamp,
+    pub timer: Timer,
+    pub alloc_info: ThreadAllocInfo,
+    pub counter_totals: [u128; KnownCounterKind::COUNT],
+}
+
+impl RawSample {
+    /// Simply computes `end - start` without clamping to precision.
+    #[inline]
+    pub fn duration(&self) -> FineDuration {
+        self.end.duration_since(self.start, self.timer)
+    }
+}
+
+/// Sample collection.
+#[derive(Default)]
+pub(crate) struct SampleCollection {
+    /// The number of iterations within each sample.
+    pub sample_size: u32,
+
+    /// Collected timings.
+    pub time_samples: Vec<TimeSample>,
+
+    /// Allocation information associated with `time_samples` by index.
+    pub alloc_info_by_sample: HashMap<u32, ThreadAllocInfo>,
+}
+
+impl SampleCollection {
+    /// Discards all recorded data.
+    #[inline]
+    pub fn clear(&mut self) {
+        self.time_samples.clear();
+        self.alloc_info_by_sample.clear();
+    }
+
+    /// Computes the total number of iterations across all samples.
+    ///
+    /// We use `u64` in case sample count and sizes are huge.
+    #[inline]
+    pub fn iter_count(&self) -> u64 {
+        self.sample_size as u64 * self.time_samples.len() as u64
+    }
+
+    /// Computes the total time across all samples.
+    #[inline]
+    pub fn total_duration(&self) -> FineDuration {
+        FineDuration { picos: self.time_samples.iter().map(|s| s.duration.picos).sum() }
+    }
+
+    /// Returns all samples sorted by duration.
+    #[inline]
+    pub fn sorted_samples(&self) -> Vec<&TimeSample> {
+        let mut result: Vec<&TimeSample> = self.time_samples.iter().collect();
+        result.sort_unstable_by_key(|s| s.duration);
+        result
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/thread_pool.rs b/crates/divan_compat/divan_fork/src/thread_pool.rs
new file mode 100644
index 00000000..c607a936
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/thread_pool.rs
@@ -0,0 +1,389 @@
+use std::{
+    num::NonZeroUsize,
+    panic::AssertUnwindSafe,
+    ptr::NonNull,
+    sync::{
+        atomic::{AtomicUsize, Ordering},
+        mpsc, Mutex, PoisonError,
+    },
+    thread::Thread,
+};
+
+use crate::util::{defer, sync::SyncWrap};
+
+/// Single shared thread pool for running benchmarks on.
+pub(crate) static BENCH_POOL: ThreadPool = ThreadPool::new();
+
+/// Reusable threads for broadcasting tasks.
+///
+/// This thread pool runs only a single task at a time, since only one benchmark
+/// should run at a time. Invoking `broadcast` from two threads will cause one
+/// thread to wait for the other to finish.
+///
+/// # How It Works
+///
+/// Upon calling `broadcast`:
+///
+/// 1. The main thread creates a `Task`, which is a pointer to a `TaskShared`
+///    pinned on the stack. `TaskShared` stores the function to run, along with
+///    other fields for coordinating threads.
+///
+/// 2. New threads are spawned if the requested amount is not available. Each
+///    receives tasks over an associated channel.
+///
+/// 3. The main thread sends the `Task` over the channels to the requested
+///    amount of threads. Upon receiving the task, each auxiliary thread will
+///    execute it and then decrement the task's reference count.
+///
+/// 4. The main thread executes the `Task` like auxiliary threads. It then waits
+///    until the reference count is 0 before returning.
+pub(crate) struct ThreadPool {
+    threads: Mutex<Vec<mpsc::SyncSender<Task>>>,
+}
+
+impl ThreadPool {
+    const fn new() -> Self {
+        Self { threads: Mutex::new(Vec::new()) }
+    }
+
+    /// Performs the given task and pushes the results into a `vec`.
+    #[inline]
+    pub fn par_extend<T, F>(&self, vec: &mut Vec<Option<T>>, aux_threads: usize, task: F)
+    where
+        F: Sync + Fn(usize) -> T,
+        T: Sync + Send,
+    {
+        unsafe {
+            let old_len = vec.len();
+            let additional = aux_threads + 1;
+
+            vec.reserve_exact(additional);
+            vec.spare_capacity_mut().iter_mut().for_each(|val| {
+                val.write(None);
+            });
+            vec.set_len(old_len + additional);
+
+            let ptr = SyncWrap::new(vec.as_mut_ptr().add(old_len));
+
+            self.broadcast(aux_threads, move |index| {
+                ptr.add(index).write(Some(task(index)));
+            });
+        }
+    }
+
+    /// Performs the given task across the current thread and auxiliary worker
+    /// threads.
+    ///
+    /// This function returns once all threads complete the task.
+    #[inline]
+    pub fn broadcast<F>(&self, aux_threads: usize, task: F)
+    where
+        F: Sync + Fn(usize),
+    {
+        // SAFETY: The `TaskShared` instance is guaranteed to be accessible to
+        // all threads until this function returns, because this thread waits
+        // until `TaskShared.ref_count` is 0 before continuing.
+        unsafe {
+            let task = TaskShared::new(aux_threads, task);
+            let task = Task { shared: NonNull::from(&task).cast() };
+
+            self.broadcast_task(aux_threads, task);
+        }
+    }
+
+    /// Type-erased monomorphized implementation for `broadcast`.
+    unsafe fn broadcast_task(&self, aux_threads: usize, task: Task) {
+        // Send task to auxiliary threads.
+        if aux_threads > 0 {
+            let threads = &mut *self.threads.lock().unwrap_or_else(PoisonError::into_inner);
+
+            // Spawn more threads if necessary.
+            if let Some(additional) = NonZeroUsize::new(aux_threads.saturating_sub(threads.len())) {
+                spawn(additional, threads);
+            }
+
+            for thread in &threads[..aux_threads] {
+                thread.send(task).unwrap();
+            }
+        }
+
+        // Run the task on the main thread.
+        let main_result = std::panic::catch_unwind(AssertUnwindSafe(|| task.run(0)));
+
+        // Wait for other threads to finish writing their results.
+        //
+        // SAFETY: The acquire memory ordering ensures that all writes performed
+        // by the task on other threads will become visible to this thread after
+        // returning from `broadcast`.
+        while task.shared.as_ref().ref_count.load(Ordering::Acquire) > 0 {
+            std::thread::park();
+        }
+
+        // Don't drop our result until other threads finish, in case the panic
+        // error's drop handler itself also panics.
+        drop(main_result);
+    }
+
+    pub fn drop_threads(&self) {
+        *self.threads.lock().unwrap_or_else(PoisonError::into_inner) = Default::default();
+    }
+
+    #[cfg(test)]
+    fn aux_thread_count(&self) -> usize {
+        self.threads.lock().unwrap_or_else(PoisonError::into_inner).len()
+    }
+}
+
+/// Type-erased function and metadata.
+#[derive(Clone, Copy)]
+struct Task {
+    shared: NonNull<TaskShared<()>>,
+}
+
+unsafe impl Send for Task {}
+unsafe impl Sync for Task {}
+
+impl Task {
+    /// Runs this task on behalf of `thread_id`.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure:
+    ///
+    /// - This task has not outlived the `TaskShared` it came from, or else
+    ///   there will be a use-after-free.
+    ///
+    /// - `thread_id` is within the number of `broadcast` threads requested, so
+    ///   that it can be used to index input or output buffers.
+    #[inline]
+    unsafe fn run(&self, thread_id: usize) {
+        let shared_ptr = self.shared.as_ptr();
+        let shared = &*shared_ptr;
+
+        (shared.task_fn_ptr)(shared_ptr.cast(), thread_id);
+    }
+}
+
+/// Data stored on the main thread that gets shared with auxiliary threads.
+///
+/// # Memory Layout
+///
+/// Since the benchmark may have thrashed the cache, this type's fields are
+/// ordered by usage order. This type is also placed on its own cache line.
+#[repr(C)]
+struct TaskShared<F> {
+    /// Once an auxiliary thread sets `ref_count` to 0, it should notify the
+    /// main thread to wake up.
+    main_thread: Thread,
+
+    /// The number of auxiliary threads executing the task.
+    ///
+    /// Once this is 0, the main thread can read any results the task produced.
+    ref_count: AtomicUsize,
+
+    /// Type-erased entry point that invokes `task_fn(thread)` for this task.
+    task_fn_ptr: unsafe fn(task: *const TaskShared<()>, thread: usize),
+
+    /// Stores the closure state of the provided task.
+    ///
+    /// This must be stored as the last field so that all other fields are in
+    /// the same place regardless of this field's type.
+    task_fn: F,
+}
+
+impl<F> TaskShared<F> {
+    #[inline]
+    fn new(aux_threads: usize, task_fn: F) -> Self
+    where
+        F: Sync + Fn(usize),
+    {
+        unsafe fn call<F>(task: *const TaskShared<()>, thread: usize)
+        where
+            F: Fn(usize),
+        {
+            let task_fn = &(*task.cast::<TaskShared<F>>()).task_fn;
+
+            task_fn(thread);
+        }
+
+        Self {
+            main_thread: std::thread::current(),
+            ref_count: AtomicUsize::new(aux_threads),
+            task_fn_ptr: call::<F>,
+            task_fn,
+        }
+    }
+}
+
+/// Spawns N additional threads and appends their channels to the list.
+///
+/// Threads are given names in the form of `divan-$INDEX`.
+#[cold]
+fn spawn(additional: NonZeroUsize, threads: &mut Vec<mpsc::SyncSender<Task>>) {
+    let next_thread_id = threads.len() + 1;
+
+    threads.extend((next_thread_id..(next_thread_id + additional.get())).map(|thread_id| {
+        // Create single-task channel. Unless another benchmark is running, the
+        // current thread will be immediately unblocked after the auxiliary
+        // thread accepts the task.
+        //
+        // This uses a rendezvous channel (capacity 0) instead of other standard
+        // library channels because it reduces memory usage by many kilobytes.
+        let (sender, receiver) = mpsc::sync_channel::<Task>(0);
+
+        let work = move || {
+            // Abort the process if the caught panic error itself panics when
+            // dropped.
+            let panic_guard = defer(|| std::process::abort());
+
+            while let Ok(task) = receiver.recv() {
+                // Run the task on this auxiliary thread.
+                //
+                // SAFETY: The task is valid until `ref_count == 0`.
+                let result =
+                    std::panic::catch_unwind(AssertUnwindSafe(|| unsafe { task.run(thread_id) }));
+
+                // Decrement the `ref_count` count to notify the main thread
+                // that we finished our work.
+                //
+                // SAFETY: This release operation makes writes within the task
+                // become visible to the main thread.
+                unsafe {
+                    // Clone the main thread's handle for unparking because the
+                    // `TaskShared` will be invalidated when `ref_count` is 0.
+                    let main_thread = task.shared.as_ref().main_thread.clone();
+
+                    if task.shared.as_ref().ref_count.fetch_sub(1, Ordering::Release) == 1 {
+                        main_thread.unpark();
+                    }
+                }
+
+                // Don't drop our result until after notifying the main thread,
+                // in case the panic error's drop handler itself also panics.
+                drop(result);
+            }
+
+            std::mem::forget(panic_guard);
+        };
+
+        std::thread::Builder::new()
+            .name(format!("divan-{thread_id}"))
+            .spawn(work)
+            .expect("failed to spawn thread");
+
+        sender
+    }));
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Make every thread write its ID to a buffer and then check that the
+    /// buffer contains all IDs.
+    #[test]
+    fn extend() {
+        static TEST_POOL: ThreadPool = ThreadPool::new();
+
+        fn test(aux_threads: usize, final_aux_threads: usize) {
+            let total_threads = aux_threads + 1;
+
+            let mut results = Vec::new();
+            let expected = (0..total_threads).map(Some).collect::<Vec<_>>();
+
+            TEST_POOL.par_extend(&mut results, aux_threads, |index| index);
+
+            assert_eq!(results, expected);
+            assert_eq!(TEST_POOL.aux_thread_count(), final_aux_threads);
+        }
+
+        test(0, 0);
+        test(1, 1);
+        test(2, 2);
+        test(3, 3);
+        test(4, 4);
+        test(8, 8);
+
+        // Decreasing auxiliary threads on later calls should still leave
+        // previously spawned threads running.
+        test(4, 8);
+        test(0, 8);
+
+        // Silence Miri about leaking threads.
+        TEST_POOL.drop_threads();
+    }
+
+    /// Execute a task that takes longer on all other threads than the main
+    /// thread.
+    #[test]
+    fn broadcast_sleep() {
+        use std::time::Duration;
+
+        static TEST_POOL: ThreadPool = ThreadPool::new();
+
+        TEST_POOL.broadcast(10, |thread_id| {
+            if thread_id > 0 {
+                std::thread::sleep(Duration::from_millis(10));
+            }
+        });
+
+        // Silence Miri about leaking threads.
+        TEST_POOL.drop_threads();
+    }
+
+    /// Checks that thread ID 0 refers to the main thread.
+    #[test]
+    fn broadcast_thread_id() {
+        static TEST_POOL: ThreadPool = ThreadPool::new();
+
+        let main_thread = std::thread::current().id();
+
+        TEST_POOL.broadcast(10, |thread_id| {
+            let is_main = main_thread == std::thread::current().id();
+            assert_eq!(is_main, thread_id == 0);
+        });
+
+        // Silence Miri about leaking threads.
+        TEST_POOL.drop_threads();
+    }
+}
+
+#[cfg(feature = "internal_benches")]
+mod benches {
+    use super::*;
+
+    fn aux_thread_counts() -> impl Iterator<Item = usize> {
+        let mut available_parallelism = std::thread::available_parallelism().ok().map(|n| n.get());
+
+        let range = 0..=16;
+
+        if let Some(n) = available_parallelism {
+            if range.contains(&n) {
+                available_parallelism = None;
+            }
+        }
+
+        range.chain(available_parallelism)
+    }
+
+    /// Benchmarks repeatedly using `ThreadPool` for the same number of threads
+    /// on every run.
+    #[crate::bench(crate = crate, args = aux_thread_counts())]
+    fn broadcast(bencher: crate::Bencher, aux_threads: usize) {
+        let pool = ThreadPool::new();
+        let benched = move || pool.broadcast(aux_threads, crate::black_box_drop);
+
+        // Warmup to spawn threads.
+        benched();
+
+        bencher.bench(benched);
+    }
+
+    /// Benchmarks using `ThreadPool` once.
+    #[crate::bench(crate = crate, args = aux_thread_counts(), sample_size = 1)]
+    fn broadcast_once(bencher: crate::Bencher, aux_threads: usize) {
+        bencher
+            .with_inputs(ThreadPool::new)
+            .bench_refs(|pool| pool.broadcast(aux_threads, crate::black_box_drop));
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/time/fence.rs b/crates/divan_compat/divan_fork/src/time/fence.rs
new file mode 100644
index 00000000..7e123225
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/time/fence.rs
@@ -0,0 +1,42 @@
+use std::sync::atomic;
+
+/// Prevents other operations from affecting timing measurements.
+#[inline(always)]
+pub fn full_fence() {
+    asm_fence();
+    atomic::fence(atomic::Ordering::SeqCst);
+}
+
+/// Prevents the compiler from reordering operations.
+#[inline(always)]
+pub fn compiler_fence() {
+    asm_fence();
+    atomic::compiler_fence(atomic::Ordering::SeqCst);
+}
+
+/// Stronger compiler fence on [platforms with stable `asm!`](https://doc.rust-lang.org/nightly/reference/inline-assembly.html).
+///
+/// This prevents LLVM from removing loops or hoisting logic out of the
+/// benchmark loop.
+#[inline(always)]
+fn asm_fence() {
+    // Miri does not support inline assembly.
+    if cfg!(miri) {
+        return;
+    }
+
+    #[cfg(any(
+        target_arch = "x86",
+        target_arch = "x86_64",
+        target_arch = "arm",
+        target_arch = "aarch64",
+        target_arch = "riscv32",
+        target_arch = "riscv64",
+        target_arch = "loongarch64",
+    ))]
+    // SAFETY: The inline assembly is a no-op.
+    unsafe {
+        // Preserve flags because we don't want to pessimize user logic.
+        std::arch::asm!("", options(nostack, preserves_flags));
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/time/fine_duration.rs b/crates/divan_compat/divan_fork/src/time/fine_duration.rs
new file mode 100644
index 00000000..566483ed
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/time/fine_duration.rs
@@ -0,0 +1,467 @@
+use std::{fmt, ops, time::Duration};
+
+use crate::util;
+
+/// [Picosecond](https://en.wikipedia.org/wiki/Picosecond)-precise [`Duration`].
+#[derive(Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub(crate) struct FineDuration {
+    pub picos: u128,
+}
+
+impl From<Duration> for FineDuration {
+    #[inline]
+    fn from(duration: Duration) -> Self {
+        Self {
+            picos: duration
+                .as_nanos()
+                .checked_mul(1_000)
+                .unwrap_or_else(|| panic!("{duration:?} is too large to fit in `FineDuration`")),
+        }
+    }
+}
+
+impl fmt::Display for FineDuration {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let sig_figs = f.precision().unwrap_or(4);
+
+        let picos = self.picos;
+        let mut scale = TimeScale::from_picos(picos);
+
+        // Prefer formatting picoseconds as nanoseconds if we can. This makes
+        // picoseconds easier to read because they are almost always alongside
+        // nanosecond-scale values.
+        if scale == TimeScale::PicoSec && sig_figs > 3 {
+            scale = TimeScale::NanoSec;
+        }
+
+        let multiple: u128 = {
+            let sig_figs = u32::try_from(sig_figs).unwrap_or(u32::MAX);
+            10_u128.saturating_pow(sig_figs)
+        };
+
+        // TODO: Format without heap allocation.
+        let mut str: String = match picos::DAY.checked_mul(multiple) {
+            Some(int_day) if picos >= int_day => {
+                // Format using integer representation to not lose precision.
+                (picos / picos::DAY).to_string()
+            }
+            _ => {
+                // Format using floating point representation.
+
+                // Multiply to allow `sig_figs` digits of fractional precision.
+                let val = (((picos * multiple) / scale.picos()) as f64) / multiple as f64;
+
+                util::fmt::format_f64(val, sig_figs)
+            }
+        };
+
+        str.push(' ');
+        str.push_str(scale.suffix());
+
+        // Fill up to specified width.
+        if let Some(fill_len) = f.width().and_then(|width| width.checked_sub(str.len())) {
+            match f.align() {
+                None | Some(fmt::Alignment::Left) => {
+                    str.extend(std::iter::repeat(f.fill()).take(fill_len));
+                }
+                _ => return Err(fmt::Error),
+            }
+        }
+
+        f.write_str(&str)
+    }
+}
+
+impl fmt::Debug for FineDuration {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        fmt::Display::fmt(self, f)
+    }
+}
+
+impl ops::Add for FineDuration {
+    type Output = Self;
+
+    #[inline]
+    fn add(self, other: Self) -> Self {
+        Self { picos: self.picos + other.picos }
+    }
+}
+
+impl ops::AddAssign for FineDuration {
+    #[inline]
+    fn add_assign(&mut self, other: Self) {
+        self.picos += other.picos
+    }
+}
+
+impl<I: Into<u128>> ops::Div<I> for FineDuration {
+    type Output = Self;
+
+    #[inline]
+    fn div(self, count: I) -> Self {
+        Self { picos: self.picos / count.into() }
+    }
+}
+
+impl FineDuration {
+    pub const ZERO: Self = Self { picos: 0 };
+
+    pub const MAX: Self = Self { picos: u128::MAX };
+
+    #[inline]
+    pub fn is_zero(&self) -> bool {
+        self.picos == 0
+    }
+
+    /// Returns `other` if `self` is zero, otherwise returns `self` unchanged.
+    #[inline]
+    pub fn clamp_to(self, other: Self) -> Self {
+        if self.is_zero() {
+            other
+        } else {
+            self
+        }
+    }
+
+    /// Returns the smaller non-zero value, or zero if both values are zero.
+    #[inline]
+    pub fn clamp_to_min(self, other: Self) -> Self {
+        if self.is_zero() {
+            other
+        } else if other.is_zero() {
+            self
+        } else {
+            self.min(other)
+        }
+    }
+}
+
+mod picos {
+    pub const NANOS: u128 = 1_000;
+    pub const MICROS: u128 = 1_000 * NANOS;
+    pub const MILLIS: u128 = 1_000 * MICROS;
+    pub const SEC: u128 = 1_000 * MILLIS;
+    pub const MIN: u128 = 60 * SEC;
+    pub const HOUR: u128 = 60 * MIN;
+    pub const DAY: u128 = 24 * HOUR;
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+enum TimeScale {
+    PicoSec,
+    NanoSec,
+    MicroSec,
+    MilliSec,
+    Sec,
+    Min,
+    Hour,
+    Day,
+}
+
+impl TimeScale {
+    #[cfg(test)]
+    const ALL: &'static [Self] = &[
+        Self::PicoSec,
+        Self::NanoSec,
+        Self::MicroSec,
+        Self::MilliSec,
+        Self::Sec,
+        Self::Min,
+        Self::Hour,
+        Self::Day,
+    ];
+
+    /// Determines the scale of time for representing a number of picoseconds.
+    fn from_picos(picos: u128) -> Self {
+        use picos::*;
+
+        if picos < NANOS {
+            Self::PicoSec
+        } else if picos < MICROS {
+            Self::NanoSec
+        } else if picos < MILLIS {
+            Self::MicroSec
+        } else if picos < SEC {
+            Self::MilliSec
+        } else if picos < MIN {
+            Self::Sec
+        } else if picos < HOUR {
+            Self::Min
+        } else if picos < DAY {
+            Self::Hour
+        } else {
+            Self::Day
+        }
+    }
+
+    /// Returns the number of picoseconds needed to reach this scale.
+    fn picos(self) -> u128 {
+        use picos::*;
+
+        match self {
+            Self::PicoSec => 1,
+            Self::NanoSec => NANOS,
+            Self::MicroSec => MICROS,
+            Self::MilliSec => MILLIS,
+            Self::Sec => SEC,
+            Self::Min => MIN,
+            Self::Hour => HOUR,
+            Self::Day => DAY,
+        }
+    }
+
+    /// Returns the unit suffix.
+    fn suffix(self) -> &'static str {
+        match self {
+            Self::PicoSec => "ps",
+            Self::NanoSec => "ns",
+            Self::MicroSec => "µs",
+            Self::MilliSec => "ms",
+            Self::Sec => "s",
+            Self::Min => "m",
+            Self::Hour => "h",
+            Self::Day => "d",
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn clamp_to() {
+        #[track_caller]
+        fn test(a: u128, b: u128, expected: u128) {
+            assert_eq!(
+                FineDuration { picos: a }.clamp_to(FineDuration { picos: b }),
+                FineDuration { picos: expected }
+            );
+        }
+
+        test(0, 0, 0); // Zero receiver: the result is `b`.
+        test(0, 1, 1);
+        test(0, 2, 2);
+        test(0, 3, 3);
+
+        test(1, 0, 1); // Non-zero receiver: returned unchanged, whatever `b` is.
+        test(1, 1, 1);
+        test(1, 2, 1);
+        test(1, 3, 1);
+
+        test(2, 0, 2);
+        test(2, 1, 2);
+        test(2, 2, 2);
+        test(2, 3, 2);
+
+        test(3, 0, 3);
+        test(3, 1, 3);
+        test(3, 2, 3);
+        test(3, 3, 3);
+    }
+
+    #[test]
+    fn clamp_to_min() {
+        #[track_caller]
+        fn test(a: u128, b: u128, expected: u128) {
+            assert_eq!(
+                FineDuration { picos: a }.clamp_to_min(FineDuration { picos: b }),
+                FineDuration { picos: expected }
+            );
+        }
+
+        test(0, 0, 0); // Zero receiver: the result is `b`.
+        test(0, 1, 1);
+        test(0, 2, 2);
+        test(0, 3, 3);
+
+        test(1, 0, 1); // A zero `b` is ignored; otherwise the smaller value wins.
+        test(1, 1, 1);
+        test(1, 2, 1);
+        test(1, 3, 1);
+
+        test(2, 0, 2);
+        test(2, 1, 1);
+        test(2, 2, 2);
+        test(2, 3, 2);
+
+        test(3, 0, 3);
+        test(3, 1, 1);
+        test(3, 2, 2);
+        test(3, 3, 3);
+    }
+
+    #[allow(clippy::zero_prefixed_literal)]
+    mod fmt {
+        use super::*;
+
+        #[track_caller]
+        fn test(picos: u128, expected: &str) {
+            let duration = FineDuration { picos };
+            assert_eq!(duration.to_string(), expected); // Default formatting.
+            assert_eq!(format!("{duration:.4}"), expected); // Precision 4 matches the default.
+            assert_eq!(format!("{duration:<0}"), expected); // Zero width adds no padding.
+        }
+
+        macro_rules! assert_fmt_eq {
+            ($input:literal, $expected:literal) => { // Both literals are expanded by `format!`.
+                assert_eq!(format!($input), format!($expected));
+            };
+        }
+
+        #[test]
+        fn precision() {
+            for &scale in TimeScale::ALL {
+                let base_duration = FineDuration { picos: scale.picos() };
+                let incr_duration = FineDuration { picos: scale.picos() + 1 };
+
+                if scale == TimeScale::PicoSec { // Only here is one extra picosecond visible.
+                    assert_eq!(format!("{base_duration:.0}"), "1 ps");
+                    assert_eq!(format!("{incr_duration:.0}"), "2 ps");
+                } else {
+                    let base_string = base_duration.to_string();
+                    assert_eq!(format!("{base_duration:.0}"), base_string);
+                    assert_eq!(format!("{incr_duration:.0}"), base_string);
+                }
+            }
+        }
+
+        #[test]
+        fn fill() {
+            for &scale in TimeScale::ALL {
+                // Picoseconds are formatted as nanoseconds by default.
+                if scale == TimeScale::PicoSec {
+                    continue;
+                }
+
+                let duration = FineDuration { picos: scale.picos() };
+                let suffix = scale.suffix();
+                let pad = " ".repeat(8 - suffix.len()); // `len()` is in bytes ("µs" is 3).
+
+                assert_fmt_eq!("{duration:<2}", "1 {suffix}");
+                assert_fmt_eq!("{duration:<10}", "1 {suffix}{pad}");
+            }
+        }
+
+        #[test]
+        fn pico_sec() {
+            test(000, "0 ns"); // Sub-nanosecond values are shown in nanoseconds.
+
+            test(001, "0.001 ns");
+            test(010, "0.01 ns"); // Trailing zeros of the fraction are not printed.
+            test(100, "0.1 ns");
+
+            test(102, "0.102 ns");
+            test(120, "0.12 ns");
+            test(123, "0.123 ns");
+            test(012, "0.012 ns");
+        }
+
+        #[test]
+        fn nano_sec() {
+            test(001_000, "1 ns");
+            test(010_000, "10 ns");
+            test(100_000, "100 ns");
+
+            test(100_002, "100 ns"); // Digits beyond four significant figures are dropped.
+            test(100_020, "100 ns");
+            test(100_200, "100.2 ns");
+            test(102_000, "102 ns");
+            test(120_000, "120 ns");
+
+            test(001_002, "1.002 ns");
+            test(001_023, "1.023 ns");
+            test(001_234, "1.234 ns");
+            test(001_230, "1.23 ns");
+            test(001_200, "1.2 ns");
+        }
+
+        #[test]
+        fn micro_sec() {
+            test(001_000_000, "1 µs");
+            test(010_000_000, "10 µs");
+            test(100_000_000, "100 µs");
+
+            test(100_000_002, "100 µs"); // Digits beyond four significant figures are dropped.
+            test(100_000_020, "100 µs");
+            test(100_000_200, "100 µs");
+            test(100_002_000, "100 µs");
+            test(100_020_000, "100 µs");
+            test(100_200_000, "100.2 µs");
+            test(102_000_000, "102 µs");
+
+            test(120_000_000, "120 µs");
+            test(012_000_000, "12 µs");
+            test(001_200_000, "1.2 µs");
+
+            test(001_020_000, "1.02 µs");
+            test(001_002_000, "1.002 µs");
+            test(001_000_200, "1 µs");
+            test(001_000_020, "1 µs");
+            test(001_000_002, "1 µs");
+
+            test(001_230_000, "1.23 µs");
+            test(001_234_000, "1.234 µs");
+            test(001_234_500, "1.234 µs");
+            test(001_234_560, "1.234 µs");
+            test(001_234_567, "1.234 µs"); // Truncated, not rounded.
+        }
+
+        #[test]
+        fn milli_sec() {
+            test(001_000_000_000, "1 ms"); // 10^9 picoseconds.
+            test(010_000_000_000, "10 ms");
+            test(100_000_000_000, "100 ms");
+        }
+
+        #[test]
+        fn sec() {
+            test(picos::SEC, "1 s");
+            test(picos::SEC * 10, "10 s");
+            test(picos::SEC * 59, "59 s");
+
+            test(picos::MILLIS * 59_999, "59.99 s"); // 59.999 s is truncated, not rounded up.
+        }
+
+        #[test]
+        fn min() {
+            test(picos::MIN, "1 m");
+            test(picos::MIN * 10, "10 m");
+            test(picos::MIN * 59, "59 m");
+
+            test(picos::MILLIS * 3_599_000, "59.98 m");
+            test(picos::MILLIS * 3_599_999, "59.99 m");
+            test(picos::HOUR - 1, "59.99 m"); // One picosecond short of an hour stays in minutes.
+        }
+
+        #[test]
+        fn hour() {
+            test(picos::HOUR, "1 h");
+            test(picos::HOUR * 10, "10 h");
+            test(picos::HOUR * 23, "23 h");
+
+            test(picos::MILLIS * 86_300_000, "23.97 h");
+            test(picos::MILLIS * 86_399_999, "23.99 h");
+            test(picos::DAY - 1, "23.99 h"); // One picosecond short of a day stays in hours.
+        }
+
+        #[test]
+        fn day() {
+            test(picos::DAY, "1 d");
+
+            test(picos::DAY + picos::DAY / 10, "1.1 d");
+            test(picos::DAY + picos::DAY / 100, "1.01 d");
+            test(picos::DAY + picos::DAY / 1000, "1.001 d");
+
+            test(picos::DAY * 000010, "10 d");
+            test(picos::DAY * 000100, "100 d");
+            test(picos::DAY * 001000, "1000 d");
+            test(picos::DAY * 010000, "10000 d");
+            test(picos::DAY * 100000, "100000 d");
+
+            test(u128::MAX / 1000, "3938453320844195178 d");
+            test(u128::MAX, "3938453320844195178974 d"); // Day is the largest scale available.
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/time/mod.rs b/crates/divan_compat/divan_fork/src/time/mod.rs
new file mode 100644
index 00000000..4fbae76b
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/time/mod.rs
@@ -0,0 +1,38 @@
+use std::time::Duration;
+
+pub mod fence;
+
+mod fine_duration;
+mod timer;
+mod timestamp;
+
+pub(crate) use fine_duration::*;
+pub(crate) use timer::*;
+pub(crate) use timestamp::*;
+
+/// Private-public trait for accepting either a `Duration` or a number of seconds.
+pub trait IntoDuration {
+    /// Consumes `self` and returns the equivalent `Duration`.
+    fn into_duration(self) -> Duration;
+}
+
+impl IntoDuration for Duration {
+    #[inline]
+    fn into_duration(self) -> Duration {
+        self // Already a `Duration`; returned as is.
+    }
+}
+
+impl IntoDuration for u64 {
+    #[inline]
+    fn into_duration(self) -> Duration {
+        Duration::from_secs(self) // The integer is interpreted as whole seconds.
+    }
+}
+
+impl IntoDuration for f64 {
+    #[inline]
+    fn into_duration(self) -> Duration {
+        Duration::from_secs_f64(self) // Seconds; std panics if negative, non-finite, or too big.
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/time/timer.rs b/crates/divan_compat/divan_fork/src/time/timer.rs
new file mode 100644
index 00000000..9e6beb28
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/time/timer.rs
@@ -0,0 +1,376 @@
+use std::{cmp::Ordering, num::NonZeroU64, sync::OnceLock};
+
+use crate::{
+    alloc::{AllocOp, ThreadAllocInfo},
+    black_box,
+    time::{FineDuration, TscTimestamp, TscUnavailable, UntaggedTimestamp},
+};
+
+/// A source of time measurements: either the OS clock or the CPU counter.
+#[derive(Clone, Copy, Default)]
+pub(crate) enum Timer {
+    /// Operating system timer; this is the default.
+    #[default]
+    Os,
+
+    /// CPU timestamp counter.
+    Tsc {
+        /// Counter ticks per second, as returned by [`TscTimestamp::frequency`].
+        frequency: NonZeroU64,
+    },
+}
+
+impl Timer {
+    const COUNT: usize = 2; // Number of timer kinds; sizes the per-kind `OnceLock` caches.
+
+    /// Lists every timer usable on this machine.
+    ///
+    /// The OS timer always comes first; the TSC timer follows it when
+    /// [`Self::get_tsc`] succeeds.
+    #[cfg(test)]
+    pub fn available() -> Vec<Self> {
+        let os_timer = std::iter::once(Self::Os);
+        let tsc_timer = Self::get_tsc().ok();
+
+        os_timer.chain(tsc_timer).collect()
+    }
+
+    /// Attempts to create a TSC timer; fails when no usable counter frequency is available.
+    #[inline]
+    pub fn get_tsc() -> Result<Self, TscUnavailable> {
+        Ok(Self::Tsc { frequency: TscTimestamp::frequency()? })
+    }
+
+    #[inline]
+    pub fn kind(self) -> TimerKind { // Maps the timer to its payload-free `TimerKind` tag.
+        match self {
+            Self::Os => TimerKind::Os,
+            Self::Tsc { .. } => TimerKind::Tsc, // The frequency payload is dropped.
+        }
+    }
+
+    /// Reports the finest non-zero interval this timer is able to resolve.
+    ///
+    /// Measured once per timer kind; later calls reuse the stored value.
+    pub fn precision(self) -> FineDuration {
+        // One slot per `TimerKind`, addressed by the kind's discriminant.
+        static SLOTS: [OnceLock<FineDuration>; Timer::COUNT] = [OnceLock::new(), OnceLock::new()];
+
+        let slot_index = self.kind() as usize;
+        *SLOTS[slot_index].get_or_init(|| self.measure_precision())
+    }
+
+    fn measure_precision(self) -> FineDuration { // Empirically finds the smallest non-zero sample.
+        let timer_kind = self.kind();
+
+        // Start with the worst possible minimum.
+        let mut min_sample = FineDuration::MAX;
+        let mut seen_count = 0; // Times the current minimum has been observed again.
+
+        // If timing in immediate succession fails to produce a non-zero sample,
+        // an artificial delay is added by looping. `usize` is intentionally
+        // used to make looping cheap.
+        let mut delay_len: usize = 0;
+
+        loop {
+            for _ in 0..100 { // One batch of 100 samples per delay length.
+                // Use `UntaggedTimestamp` to minimize overhead.
+                let sample_start: UntaggedTimestamp;
+                let sample_end: UntaggedTimestamp;
+
+                if delay_len == 0 {
+                    // Immediate succession.
+                    sample_start = UntaggedTimestamp::start(timer_kind);
+                    sample_end = UntaggedTimestamp::end(timer_kind);
+                } else {
+                    // Add delay.
+                    sample_start = UntaggedTimestamp::start(timer_kind);
+                    for n in 0..delay_len {
+                        crate::black_box(n);
+                    }
+                    sample_end = UntaggedTimestamp::end(timer_kind);
+                }
+
+                // SAFETY: These values are guaranteed to be the correct variant
+                // because they were created from the same `timer_kind`.
+                let [sample_start, sample_end] = unsafe {
+                    [sample_start.into_timestamp(timer_kind), sample_end.into_timestamp(timer_kind)]
+                };
+
+                let sample = sample_end.duration_since(sample_start, self);
+
+                // A zero sample carries no information about precision; skip it.
+                if sample.is_zero() {
+                    continue;
+                }
+
+                match sample.cmp(&min_sample) {
+                    Ordering::Greater => {
+                        // If we have already delayed a lot without reaching the
+                        // seen count threshold, settle for the current minimum.
+                        if delay_len > 100 {
+                            return min_sample;
+                        }
+                    }
+                    Ordering::Equal => {
+                        seen_count += 1;
+
+                        // If we've seen this min 100 times, we have high
+                        // confidence this is the smallest duration.
+                        if seen_count >= 100 {
+                            return min_sample;
+                        }
+                    }
+                    Ordering::Less => {
+                        min_sample = sample;
+                        seen_count = 0;
+                    }
+                }
+            }
+
+            delay_len = delay_len.saturating_add(1);
+        }
+    }
+
+    /// Returns the overheads added by the benchmarker, measured once per timer kind.
+    ///
+    /// `min_time` and `max_time` do not consider this as benchmarking time.
+    pub fn bench_overheads(self) -> &'static TimedOverhead {
+        // Miri is slow, so skip measuring and report zero overhead.
+        if cfg!(miri) {
+            return &TimedOverhead::ZERO;
+        }
+
+        static CACHED: [OnceLock<TimedOverhead>; Timer::COUNT] = [OnceLock::new(), OnceLock::new()];
+
+        let cached = &CACHED[self.kind() as usize]; // One cache slot per timer kind.
+
+        cached.get_or_init(|| TimedOverhead {
+            sample_loop: self.sample_loop_overhead(),
+            tally_alloc: self.measure_tally_alloc_overhead(),
+            tally_dealloc: self.measure_tally_dealloc_overhead(),
+            tally_realloc: self.measure_tally_realloc_overhead(),
+        })
+    }
+
+    /// Returns the cached per-iteration overhead of the benchmarking sample loop.
+    fn sample_loop_overhead(self) -> FineDuration {
+        // Miri is slow, so skip measuring and report the default duration.
+        if cfg!(miri) {
+            return FineDuration::default();
+        }
+
+        static CACHED: [OnceLock<FineDuration>; Timer::COUNT] = [OnceLock::new(), OnceLock::new()];
+
+        let cached = &CACHED[self.kind() as usize]; // One cache slot per timer kind.
+
+        *cached.get_or_init(|| self.measure_sample_loop_overhead())
+    }
+
+    /// Measures the per-iteration overhead of the benchmarking sample loop.
+    fn measure_sample_loop_overhead(self) -> FineDuration {
+        let timer_kind = self.kind();
+
+        let sample_count: usize = 100; // Number of timed runs.
+        let sample_size: usize = 10_000; // Loop iterations per timed run.
+
+        // The minimum non-zero sample seen so far.
+        let mut min_sample = FineDuration::default();
+
+        for _ in 0..sample_count {
+            let start = UntaggedTimestamp::start(timer_kind);
+
+            for i in 0..sample_size {
+                _ = crate::black_box(i);
+            }
+
+            let end = UntaggedTimestamp::end(timer_kind);
+
+            // SAFETY: These values are guaranteed to be the correct variant because
+            // they were created from the same `timer_kind`.
+            let [start, end] =
+                unsafe { [start.into_timestamp(timer_kind), end.into_timestamp(timer_kind)] };
+
+            let mut sample = end.duration_since(start, self);
+            sample.picos /= sample_size as u128; // Average cost of a single iteration.
+
+            min_sample = min_sample.clamp_to_min(sample);
+        }
+
+        min_sample
+    }
+
+    fn measure_tally_alloc_overhead(self) -> FineDuration { // Cost of one `tally_alloc` call.
+        let size = black_box(0); // Presumably keeps the size opaque to the optimizer.
+        self.measure_alloc_info_overhead(|alloc_info| alloc_info.tally_alloc(size))
+    }
+
+    fn measure_tally_dealloc_overhead(self) -> FineDuration { // Cost of one `tally_dealloc` call.
+        let size = black_box(0); // Presumably keeps the size opaque to the optimizer.
+        self.measure_alloc_info_overhead(|alloc_info| alloc_info.tally_dealloc(size))
+    }
+
+    fn measure_tally_realloc_overhead(self) -> FineDuration { // Cost of one `tally_realloc` call.
+        let new_size = black_box(0); // Presumably keeps the sizes opaque to the optimizer.
+        let old_size = black_box(0);
+        self.measure_alloc_info_overhead(|alloc_info| alloc_info.tally_realloc(old_size, new_size))
+    }
+
+    // SAFETY: This function is not reentrant. Calling it within `operation`
+    // would cause aliasing of `ThreadAllocInfo::current`.
+    fn measure_alloc_info_overhead(self, operation: impl Fn(&mut ThreadAllocInfo)) -> FineDuration {
+        // Initialize the current thread's alloc info.
+        let alloc_info = ThreadAllocInfo::current();
+
+        let sample_count = 100; // Number of timed runs.
+        let sample_size = 50_000; // Calls to `operation` per timed run.
+
+        let result = self.measure_min_time(sample_count, sample_size, || {
+            if let Some(mut alloc_info) = ThreadAllocInfo::try_current() {
+                // SAFETY: We have exclusive access.
+                operation(unsafe { alloc_info.as_mut() });
+            }
+        });
+
+        // Clear alloc info so that tallies recorded while measuring do not linger.
+        if let Some(mut alloc_info) = alloc_info {
+            // SAFETY: We have exclusive access.
+            let alloc_info = unsafe { alloc_info.as_mut() };
+
+            alloc_info.clear();
+        }
+
+        result
+    }
+
+    /// Measures the smallest non-zero per-call time of `operation`, net of loop overhead.
+    fn measure_min_time(
+        self,
+        sample_count: usize,
+        sample_size: usize,
+        operation: impl Fn(),
+    ) -> FineDuration {
+        let timer_kind = self.kind();
+
+        let loop_overhead = self.sample_loop_overhead();
+        let mut min_sample = FineDuration::default();
+
+        for _ in 0..sample_count {
+            let start = UntaggedTimestamp::start(timer_kind);
+
+            for _ in 0..sample_size {
+                operation();
+            }
+
+            let end = UntaggedTimestamp::end(timer_kind);
+
+            // SAFETY: These values are guaranteed to be the correct variant
+            // because they were created from the same `timer_kind`.
+            let [start, end] =
+                unsafe { [start.into_timestamp(timer_kind), end.into_timestamp(timer_kind)] };
+
+            let mut sample = end.duration_since(start, self);
+            sample.picos /= sample_size as u128; // Average cost of a single call.
+
+            // Remove benchmarking loop overhead, saturating at zero.
+            sample.picos = sample.picos.saturating_sub(loop_overhead.picos);
+
+            min_sample = min_sample.clamp_to_min(sample);
+        }
+
+        min_sample
+    }
+}
+
+/// Payload-free tag for a [`Timer`] variant; cast to `usize` to index per-kind caches.
+#[derive(Clone, Copy, Default)]
+pub(crate) enum TimerKind {
+    /// Operating system timer; this is the default.
+    #[default]
+    Os,
+
+    /// CPU timestamp counter.
+    Tsc,
+}
+
+/// The measured per-operation overhead of various benchmarking operations.
+pub(crate) struct TimedOverhead {
+    pub sample_loop: FineDuration, // Per-iteration cost of the benchmarking sample loop.
+    pub tally_alloc: FineDuration, // Cost of one `tally_alloc` call.
+    pub tally_dealloc: FineDuration, // Cost of one `tally_dealloc` call.
+    pub tally_realloc: FineDuration, // Cost of one `tally_realloc` call.
+}
+
+impl TimedOverhead {
+    /// Overhead value in which every component is zero.
+    pub const ZERO: Self = Self {
+        sample_loop: FineDuration::ZERO,
+        tally_alloc: FineDuration::ZERO,
+        tally_dealloc: FineDuration::ZERO,
+        tally_realloc: FineDuration::ZERO,
+    };
+
+    /// Sums the benchmarker overhead attributable to one sample.
+    ///
+    /// The loop overhead is scaled by `sample_size`, and each allocation tally
+    /// overhead is scaled by the matching operation count in `alloc_info`
+    /// (grow and shrink both count as reallocations). Multiplications and the
+    /// final sum saturate instead of overflowing.
+    pub fn total_overhead(&self, sample_size: u32, alloc_info: &ThreadAllocInfo) -> FineDuration {
+        // Number of recorded operations of the given kind, widened to `u128`.
+        let count_of = |op: AllocOp| alloc_info.tallies.get(op).count as u128;
+
+        let loop_cost = self.sample_loop.picos.saturating_mul(sample_size as u128);
+        let alloc_cost = self.tally_alloc.picos.saturating_mul(count_of(AllocOp::Alloc));
+        let dealloc_cost = self.tally_dealloc.picos.saturating_mul(count_of(AllocOp::Dealloc));
+
+        let realloc_count = count_of(AllocOp::Grow) + count_of(AllocOp::Shrink);
+        let realloc_cost = self.tally_realloc.picos.saturating_mul(realloc_count);
+
+        let picos = loop_cost
+            .saturating_add(alloc_cost)
+            .saturating_add(dealloc_cost)
+            .saturating_add(realloc_cost);
+
+        FineDuration { picos }
+    }
+}
+
+#[cfg(feature = "internal_benches")]
+mod benches { // Internal benchmarks; compiled only with the `internal_benches` feature.
+    use super::*;
+
+    #[crate::bench(crate = crate)]
+    fn get_tsc() -> Result<Timer, TscUnavailable> {
+        Timer::get_tsc()
+    }
+
+    mod measure { // Benchmarks each uncached measurement routine using the OS timer.
+        use super::*;
+
+        #[crate::bench(crate = crate)]
+        fn precision() -> FineDuration {
+            Timer::Os.measure_precision()
+        }
+
+        #[crate::bench(crate = crate)]
+        fn sample_loop_overhead() -> FineDuration {
+            Timer::Os.measure_sample_loop_overhead()
+        }
+
+        #[crate::bench(crate = crate)]
+        fn tally_alloc_overhead() -> FineDuration {
+            Timer::Os.measure_tally_alloc_overhead()
+        }
+
+        #[crate::bench(crate = crate)]
+        fn tally_dealloc_overhead() -> FineDuration {
+            Timer::Os.measure_tally_dealloc_overhead()
+        }
+
+        #[crate::bench(crate = crate)]
+        fn tally_realloc_overhead() -> FineDuration {
+            Timer::Os.measure_tally_realloc_overhead()
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/time/timestamp/mod.rs b/crates/divan_compat/divan_fork/src/time/timestamp/mod.rs
new file mode 100644
index 00000000..0124694c
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/time/timestamp/mod.rs
@@ -0,0 +1,88 @@
+use std::time::Instant;
+
+use crate::time::{fence, FineDuration, Timer, TimerKind};
+
+mod tsc;
+
+pub(crate) use tsc::*;
+
+/// A point in time read from one of the supported timers.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) enum Timestamp {
+    /// Time provided by the operating system, as an [`Instant`].
+    Os(Instant),
+
+    /// [CPU timestamp counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter).
+    Tsc(TscTimestamp),
+}
+
+impl Timestamp {
+    #[inline(always)]
+    pub fn start(timer_kind: TimerKind) -> Self { // Reads the timer selected by `timer_kind`.
+        fence::full_fence(); // Presumably keeps prior work from drifting into the timed region.
+        let value = match timer_kind {
+            TimerKind::Os => Self::Os(Instant::now()),
+            TimerKind::Tsc => Self::Tsc(TscTimestamp::start()),
+        };
+        fence::compiler_fence();
+        value
+    }
+
+    pub fn duration_since(self, earlier: Self, timer: Timer) -> FineDuration {
+        match (self, earlier, timer) { // Both timestamps and the timer must be the same kind.
+            (Self::Os(this), Self::Os(earlier), Timer::Os) => this.duration_since(earlier).into(),
+            (Self::Tsc(this), Self::Tsc(earlier), Timer::Tsc { frequency }) => {
+                this.duration_since(earlier, frequency)
+            }
+            _ => unreachable!(), // Panics if kinds are mixed, which callers must never do.
+        }
+    }
+}
+
+/// A [`Timestamp`] where the variant is determined by an external source of
+/// truth.
+///
+/// By making the variant tag external to this type, we produce more optimized
+/// code by:
+/// - Reusing the same condition (the `TimerKind`) for every branch
+/// - Reducing the size of the timestamp variables
+#[derive(Clone, Copy)]
+pub(crate) union UntaggedTimestamp {
+    /// Payload of [`Timestamp::Os`]; set when created with `TimerKind::Os`.
+    pub os: Instant,
+
+    /// Payload of [`Timestamp::Tsc`]; set when created with `TimerKind::Tsc`.
+    pub tsc: TscTimestamp,
+}
+
+impl UntaggedTimestamp {
+    #[inline(always)]
+    pub fn start(timer_kind: TimerKind) -> Self { // Full fence first, compiler fence after.
+        fence::full_fence();
+        let value = match timer_kind {
+            TimerKind::Os => Self { os: Instant::now() },
+            TimerKind::Tsc => Self { tsc: TscTimestamp::start() },
+        };
+        fence::compiler_fence();
+        value
+    }
+
+    #[inline(always)]
+    pub fn end(timer_kind: TimerKind) -> Self { // Mirror of `start`: fences in reverse order.
+        fence::compiler_fence();
+        let value = match timer_kind {
+            TimerKind::Os => Self { os: Instant::now() },
+            TimerKind::Tsc => Self { tsc: TscTimestamp::end() },
+        };
+        fence::full_fence();
+        value
+    }
+
+    #[inline(always)]
+    pub unsafe fn into_timestamp(self, timer_kind: TimerKind) -> Timestamp {
+        match timer_kind { // Safety: must be the same kind this value was created with.
+            TimerKind::Os => Timestamp::Os(self.os),
+            TimerKind::Tsc => Timestamp::Tsc(self.tsc),
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/time/timestamp/tsc/aarch64.rs b/crates/divan_compat/divan_fork/src/time/timestamp/tsc/aarch64.rs
new file mode 100644
index 00000000..deff9ee4
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/time/timestamp/tsc/aarch64.rs
@@ -0,0 +1,37 @@
+use std::arch::asm;
+
+use crate::time::TscUnavailable;
+
+/// Reads the [`cntfrq_el0`](https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/CNTFRQ-EL0--Counter-timer-Frequency-register?lang=en)
+/// register.
+///
+/// This value is set on system initialization and thus does not change between
+/// reads.
+#[inline]
+pub(crate) fn frequency() -> Result<u64, TscUnavailable> {
+    unsafe { // SAFETY: only reads a system register; assumes the read is permitted here.
+        let frequency: u64;
+        asm!(
+            "mrs {}, cntfrq_el0",
+            out(reg) frequency,
+            options(nomem, nostack, preserves_flags, pure),
+        );
+        Ok(frequency) // Always `Ok` here; a zero value is rejected by the caller.
+    }
+}
+
+/// Reads the [`cntvct_el0`](https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/CNTVCT-EL0--Counter-timer-Virtual-Count-register?lang=en)
+/// register.
+#[inline(always)]
+pub(crate) fn timestamp() -> u64 {
+    unsafe { // SAFETY: only reads a system register; assumes the read is permitted here.
+        let timestamp: u64;
+        asm!(
+            "mrs {}, cntvct_el0",
+            out(reg) timestamp,
+            // Leave off `nomem` because this should be a compiler fence.
+            options(nostack, preserves_flags),
+        );
+        timestamp
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/time/timestamp/tsc/mod.rs b/crates/divan_compat/divan_fork/src/time/timestamp/tsc/mod.rs
new file mode 100644
index 00000000..c8f2455f
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/time/timestamp/tsc/mod.rs
@@ -0,0 +1,112 @@
+use std::{fmt, num::NonZeroU64};
+
+use crate::time::FineDuration;
+
+#[cfg(target_arch = "aarch64")]
+#[path = "aarch64.rs"]
+mod arch;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[path = "x86.rs"]
+mod arch;
+
+/// [CPU timestamp counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter).
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub(crate) struct TscTimestamp {
+    pub value: u64, // Raw counter reading, in ticks.
+}
+
+impl TscTimestamp {
+    /// Gets the timestamp frequency, in ticks per second.
+    ///
+    /// On AArch64, this simply reads `cntfrq_el0`. On x86, this measures the
+    /// TSC frequency. Other targets and Miri report `Unimplemented`.
+    #[inline]
+    #[allow(unreachable_code)]
+    pub fn frequency() -> Result<NonZeroU64, TscUnavailable> {
+        // Miri does not support inline assembly.
+        #[cfg(miri)]
+        return Err(TscUnavailable::Unimplemented);
+
+        #[cfg(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64"))]
+        return NonZeroU64::new(arch::frequency()?).ok_or(TscUnavailable::ZeroFrequency);
+
+        Err(TscUnavailable::Unimplemented)
+    }
+
+    /// Reads the timestamp counter to begin a measurement (0 on unsupported targets).
+    #[inline(always)]
+    pub fn start() -> Self {
+        #[allow(unused)]
+        let value = 0; // Fallback; shadowed below on supported architectures.
+
+        #[cfg(target_arch = "aarch64")]
+        let value = arch::timestamp();
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        let value = arch::start_timestamp();
+
+        Self { value }
+    }
+
+    /// Reads the timestamp counter to finish a measurement (0 on unsupported targets).
+    #[inline(always)]
+    pub fn end() -> Self {
+        #[allow(unused)]
+        let value = 0; // Fallback; shadowed below on supported architectures.
+
+        #[cfg(target_arch = "aarch64")]
+        let value = arch::timestamp();
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        let value = arch::end_timestamp();
+
+        Self { value }
+    }
+
+    pub fn duration_since(self, earlier: Self, frequency: NonZeroU64) -> FineDuration {
+        const PICOS: u128 = 1_000_000_000_000; // Picoseconds per second.
+
+        let Some(diff) = self.value.checked_sub(earlier.value) else {
+            return Default::default(); // `earlier` is later than `self`: use the default.
+        };
+
+        FineDuration { picos: (diff as u128 * PICOS) / frequency.get() as u128 }
+    }
+}
+
+/// The reason the timestamp counter cannot be used.
+#[derive(Clone, Copy)]
+pub(crate) enum TscUnavailable {
+    /// Not implemented for this platform; also reported under Miri.
+    Unimplemented,
+
+    /// The obtained frequency was 0.
+    ZeroFrequency,
+
+    /// Missing the appropriate instructions (x86 only).
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    MissingInstructions,
+
+    /// The timestamp counter is not guaranteed to be constant (x86 only).
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    VariableFrequency,
+}
+
+impl fmt::Display for TscUnavailable {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let reason = match self { // Short lowercase phrase for each variant.
+            Self::Unimplemented => "unimplemented",
+            Self::ZeroFrequency => "zero TSC frequency",
+
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Self::MissingInstructions => "missing instructions",
+
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Self::VariableFrequency => "variable TSC frequency",
+        };
+
+        f.write_str(reason) // Written as is; formatter width and fill are not applied.
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/time/timestamp/tsc/x86.rs b/crates/divan_compat/divan_fork/src/time/timestamp/tsc/x86.rs
new file mode 100644
index 00000000..d1df9d71
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/time/timestamp/tsc/x86.rs
@@ -0,0 +1,273 @@
+#[cfg(target_arch = "x86")]
+use std::arch::x86;
+
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64 as x86;
+
+use std::time::{Duration, Instant};
+
+use crate::time::{fence, TscUnavailable};
+
+/// Reads the TSC for the start of a timed section, with a load fence on each side.
+#[inline(always)]
+pub(crate) fn start_timestamp() -> u64 {
+    // Serialize previous operations before `rdtsc` to ensure they are not
+    // inside the timed section.
+    util::lfence();
+
+    let tsc = util::rdtsc();
+
+    // Serialize `rdtsc` before any measured code.
+    util::lfence();
+    tsc
+}
+
+/// Reads the TSC for the end of a timed section via `rdtscp`, then issues a load fence.
+#[inline(always)]
+pub(crate) fn end_timestamp() -> u64 {
+    // `rdtscp` is serialized after all previous operations.
+    let tsc = util::rdtscp();
+
+    // Serialize `rdtscp` before any subsequent code.
+    util::lfence();
+    tsc
+}
+
+/// Determines the TSC frequency in Hz.
+///
+/// Fails when CPUID reports the TSC instructions as missing or the counter as
+/// not invariant. Otherwise the frequency is measured, and the nominal value
+/// from the CPU brand string replaces it when both agree to within 0.1%.
+pub(crate) fn frequency() -> Result<u64, TscUnavailable> {
+    if !util::tsc_is_available() {
+        Err(TscUnavailable::MissingInstructions)
+    } else if !util::tsc_is_invariant() {
+        Err(TscUnavailable::VariableFrequency)
+    } else {
+        let nominal_hz = nominal_frequency();
+        let measured_hz = measure::measure_frequency();
+
+        // The nominal frequency gives an exact value when the measurement is
+        // slightly off, but it is not blindly trusted because it may not match
+        // the TSC frequency.
+        let (lower, upper) = (measured_hz * 0.999, measured_hz * 1.001);
+        let chosen_hz = match nominal_hz {
+            Some(hz) if lower < hz && hz < upper => hz,
+            _ => measured_hz,
+        };
+        Ok(chosen_hz.round() as u64)
+    }
+}
+
+/// Parses the CPU frequency in the brand name, e.g. "2.50GHz".
+///
+/// Units are tried in the order MHz, GHz, THz. For each unit present, the text
+/// between the last preceding space and the unit's first occurrence is parsed
+/// as a number; the first successful parse is returned, scaled to Hz.
+///
+/// Returns `None` if the brand string is unavailable, is not valid UTF-8, or
+/// contains no parsable frequency.
+fn nominal_frequency() -> Option<f64> {
+    #[rustfmt::skip]
+    const UNITS: [(&str, f64); 3] = [
+        ("MHz", 1e6),
+        ("GHz", 1e9),
+        ("THz", 1e12),
+    ];
+
+    let raw = util::cpu_name()?;
+
+    // The brand string is null-terminated; keep only the bytes before the
+    // first null.
+    let end = raw.iter().position(|&byte| byte == 0).unwrap_or(raw.len());
+    let brand = std::str::from_utf8(&raw[..end]).ok()?;
+
+    UNITS.iter().find_map(|&(unit, scale)| {
+        let prefix = &brand[..brand.find(unit)?];
+
+        // Keep what follows the last space, or the whole prefix without one.
+        let digits = prefix.rsplit_once(' ').map_or(prefix, |(_, tail)| tail);
+
+        let value = digits.parse::<f64>().ok()?;
+        Some(value * scale)
+    })
+}
+
+mod util {
+    use super::*;
+
+    /// Reads the timestamp counter with `rdtsc` between `fence::compiler_fence` calls.
+    #[inline(always)]
+    pub fn rdtsc() -> u64 {
+        fence::compiler_fence();
+
+        // SAFETY: Reading the TSC is memory safe.
+        let tsc = unsafe { x86::_rdtsc() };
+
+        fence::compiler_fence();
+        tsc
+    }
+
+    /// Reads the timestamp counter with `rdtscp` between `fence::compiler_fence` calls.
+    #[inline(always)]
+    pub fn rdtscp() -> u64 {
+        fence::compiler_fence();
+
+        // SAFETY: Reading the TSC is memory safe.
+        let tsc = unsafe { x86::__rdtscp(&mut 0) };
+
+        fence::compiler_fence();
+        tsc
+    }
+
+    /// Issues a load fence (`lfence`).
+    #[inline(always)]
+    pub fn lfence() {
+        // SAFETY: A load fence is memory safe.
+        unsafe { x86::_mm_lfence() }
+    }
+
+    #[inline]
+    fn cpuid(leaf: u32) -> x86::CpuidResult {
+        // SAFETY: `cpuid` is never unsafe to call.
+        unsafe { x86::__cpuid(leaf) }
+    }
+
+    /// Invokes CPUID and converts its output registers to an ordered array.
+    #[inline]
+    fn cpuid_array(leaf: u32) -> [u32; 4] {
+        let cpuid = cpuid(leaf);
+        [cpuid.eax, cpuid.ebx, cpuid.ecx, cpuid.edx]
+    }
+
+    /// Returns `true` if the given extended CPUID leaf is available.
+    #[inline]
+    fn cpuid_has_leaf(leaf: u32) -> bool {
+        cpuid(0x8000_0000).eax >= leaf
+    }
+
+    /// Returns `true` if CPUID indicates that both the `rdtsc` and `rdtscp`
+    /// instructions are available; `end_timestamp` executes `rdtscp`.
+    #[inline]
+    pub fn tsc_is_available() -> bool {
+        // `rdtsc` is reported in standard leaf 1 (EDX bit 4); the same bit of
+        // extended leaf 0x8000_0001 is reserved on Intel processors.
+        let has_rdtsc = cpuid(1).edx & (1 << 4) != 0;
+        // `rdtscp` is reported in extended leaf 0x8000_0001 (EDX bit 27).
+        let ext_leaf = 0x8000_0001;
+        let has_rdtscp = cpuid_has_leaf(ext_leaf) && cpuid(ext_leaf).edx & (1 << 27) != 0;
+
+        has_rdtsc && has_rdtscp
+    }
+
+    /// Returns `true` if CPUID indicates that the timestamp counter has a
+    /// constant frequency.
+    #[inline]
+    pub fn tsc_is_invariant() -> bool {
+        let leaf = 0x8000_0007;
+        cpuid_has_leaf(leaf) && cpuid(leaf).edx & (1 << 8) != 0
+    }
+
+    /// Returns the processor model name as a null-terminated ASCII string.
+    pub fn cpu_name() -> Option<[u8; 48]> {
+        if !cpuid_has_leaf(0x8000_0004) {
+            return None;
+        }
+
+        #[rustfmt::skip]
+        let result = [
+            cpuid_array(0x8000_0002),
+            cpuid_array(0x8000_0003),
+            cpuid_array(0x8000_0004),
+        ];
+
+        // SAFETY: `[[u32; 4]; 3]` and `[u8; 48]` have equal size; any bytes are valid `u8`s.
+        Some(unsafe { std::mem::transmute(result) })
+    }
+}
+
+mod measure {
+    use super::*;
+
+    /// Returns the TSC frequency by measuring it.
+    pub fn measure_frequency() -> f64 {
+        const TRIES: usize = 8;
+
+        // Start with a delay of 1ms, doubling each try up to 128ms (2^(TRIES - 1)).
+        let mut delay_ms = 1;
+
+        let mut prev_measure = f64::NEG_INFINITY;
+        let mut measures = [0.0; TRIES];
+
+        for slot in &mut measures {
+            let measure = measure_frequency_once(Duration::from_millis(delay_ms));
+
+            // This measurement is sufficiently accurate if within 0.1% of the
+            // previous.
+            if measure * 0.999 < prev_measure && prev_measure < measure * 1.001 {
+                return measure;
+            }
+
+            *slot = measure;
+            prev_measure = measure;
+
+            delay_ms *= 2;
+        }
+
+        // If no two consecutive measurements were within 0.1% of each other,
+        // return the earlier measurement of the pair with the smallest delta.
+        let mut min_delta = f64::INFINITY;
+        let mut result_index = 0;
+
+        for i in 0..TRIES {
+            for j in (i + 1)..TRIES {
+                let delta = (measures[i] - measures[j]).abs();
+
+                if delta < min_delta {
+                    min_delta = delta;
+                    result_index = i;
+                }
+            }
+        }
+
+        measures[result_index]
+    }
+
+    /// Measures the TSC frequency in Hz across a single sleep of `delay`.
+    fn measure_frequency_once(delay: Duration) -> f64 {
+        let (start_tsc, start_instant) = tsc_instant_pair();
+        std::thread::sleep(delay);
+        let (end_tsc, end_instant) = tsc_instant_pair();
+
+        let elapsed_tsc = end_tsc.saturating_sub(start_tsc);
+        let elapsed_duration = end_instant.duration_since(start_instant);
+        (elapsed_tsc as f64 / elapsed_duration.as_nanos() as f64) * 1e9
+    }
+
+    /// Returns a timestamp/instant pair that has a small latency between
+    /// getting the two values.
+    fn tsc_instant_pair() -> (u64, Instant) {
+        let mut best_latency = Duration::MAX;
+        let mut best_pair = (0, Instant::now());
+
+        // Make up to 100 attempts to get a low latency pair.
+        for _ in 0..100 {
+            let instant = Instant::now();
+            let tsc = util::rdtsc();
+            let latency = instant.elapsed();
+
+            let pair = (tsc, instant);
+
+            // A zero latency cannot be improved upon, so stop early.
+            if latency.is_zero() {
+                return pair;
+            }
+
+            if latency < best_latency {
+                best_latency = latency;
+                best_pair = pair;
+            }
+        }
+
+        best_pair
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/tree_painter.rs b/crates/divan_compat/divan_fork/src/tree_painter.rs
new file mode 100644
index 00000000..7e5d668f
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/tree_painter.rs
@@ -0,0 +1,517 @@
+//! Happy little trees.
+
+use std::{io::Write, iter::repeat};
+
+use crate::{
+    alloc::{AllocOp, AllocTally},
+    counter::{AnyCounter, BytesFormat, KnownCounterKind},
+    stats::{Stats, StatsSet},
+    util,
+};
+
+const TREE_COL_BUF: usize = 2;
+
+/// Paints tree-style output using box-drawing characters.
+pub(crate) struct TreePainter {
+    /// The maximum number of characters taken by a name and its prefix. Emitted
+    /// information should be left-padded to start at this column.
+    max_name_span: usize,
+    /// Width of each table column; grows when a wider value is written.
+    column_widths: [usize; TreeColumn::COUNT],
+    /// Current nesting level; 0 means the next parent is top-level.
+    depth: usize,
+
+    /// The current prefix to the name and content, e.g.
+    /// <code>│     │  </code> for three levels of nesting with the second level
+    /// being on the last node.
+    current_prefix: String,
+
+    /// Buffer for writing to before printing to stdout.
+    write_buf: String,
+}
+
+impl TreePainter {
+    /// Creates a painter at depth 0 with an empty prefix and write buffer.
+    ///
+    /// `max_name_span` and `column_widths` seed the name column span and the
+    /// table column widths.
+    pub fn new(max_name_span: usize, column_widths: [usize; TreeColumn::COUNT]) -> Self {
+        let (current_prefix, write_buf) = (String::new(), String::new());
+
+        Self { max_name_span, column_widths, depth: 0, current_prefix, write_buf }
+    }
+}
+
+impl TreePainter {
+    /// Enter a parent node.
+    ///
+    /// Prints `name` on its own line under the current prefix. When the table
+    /// has columns, the name is right-padded and followed by the column
+    /// headings (top-level parents) or empty column spacers (nested parents).
+    /// `is_last` selects the branch glyph of a nested parent and the prefix
+    /// segment added for its descendants.
+    pub fn start_parent(&mut self, name: &str, is_last: bool) {
+        let is_top_level = self.depth == 0;
+        let has_columns = self.has_columns();
+
+        let buf = &mut self.write_buf;
+        buf.clear();
+
+        let branch = if is_top_level {
+            ""
+        } else if !is_last {
+            "├─ "
+        } else {
+            "╰─ "
+        };
+        buf.extend([self.current_prefix.as_str(), branch, name]);
+
+        if has_columns {
+            // Same padding as the other row writers; also grows
+            // `max_name_span` when this name is wider than any seen so far.
+            right_pad_buffer(buf, &mut self.max_name_span);
+
+            if is_top_level {
+                // Write column headings.
+                let names = TreeColumnData::from_fn(TreeColumn::name);
+                names.write(buf, &mut self.column_widths);
+            } else {
+                // Write column spacers.
+                TreeColumnData([""; TreeColumn::COUNT]).write(buf, &mut self.column_widths);
+            }
+        }
+
+        println!("{buf}");
+
+        self.depth += 1;
+
+        // Top-level parents are printed without a branch, so their children
+        // need no extra prefix.
+        if !is_top_level {
+            self.current_prefix.push_str(if !is_last { "│  " } else { "   " });
+        }
+    }
+
+    /// Exit the current parent node.
+    ///
+    /// Prints a blank line after a top-level parent and shortens the prefix.
+    pub fn finish_parent(&mut self) {
+        self.depth -= 1;
+
+        // Improve legibility for multiple top-level parents.
+        if self.depth == 0 {
+            println!();
+        }
+
+        // The prefix is extended by 3 `char`s at a time, so keep the bytes
+        // before the third `char` from the end (nothing if there are fewer).
+        let third_from_end = self.current_prefix.char_indices().rev().nth(2);
+        let kept_len = third_from_end.map_or(0, |(offset, _)| offset);
+        self.current_prefix.truncate(kept_len);
+    }
+
+    /// Indicate that the next child node was ignored.
+    ///
+    /// Combines the start and finish steps of a leaf in a single call.
+    pub fn ignore_leaf(&mut self, name: &str, is_last: bool) {
+        const MARKER: &str = "(ignored)";
+        let show_columns = self.has_columns();
+        let branch = if is_last { "╰─ " } else { "├─ " };
+
+        let line = &mut self.write_buf;
+        line.clear();
+        line.push_str(&self.current_prefix);
+        line.push_str(branch);
+        line.push_str(name);
+        right_pad_buffer(line, &mut self.max_name_span);
+
+        if show_columns {
+            TreeColumnData::from_first(MARKER).write(line, &mut self.column_widths);
+        } else {
+            line.push_str(MARKER);
+        }
+        println!("{line}");
+    }
+
+    /// Enter a leaf node.
+    ///
+    /// Prints the prefix, branch and `name` without a trailing newline and
+    /// flushes stdout so the partial line is emitted immediately. The line is
+    /// completed later by `finish_leaf` or `finish_empty_leaf`. `is_last`
+    /// selects the branch glyph.
+    pub fn start_leaf(&mut self, name: &str, is_last: bool) {
+        let has_columns = self.has_columns();
+
+        let buf = &mut self.write_buf;
+        buf.clear();
+
+        let branch = if !is_last { "├─ " } else { "╰─ " };
+        buf.extend([self.current_prefix.as_str(), branch, name]);
+
+        // Right-pad buffer if this leaf will have info displayed. This uses
+        // the shared helper so the padding rule matches every other row.
+        if has_columns {
+            right_pad_buffer(buf, &mut self.max_name_span);
+        }
+
+        print!("{buf}");
+        // The flush result is deliberately ignored.
+        _ = std::io::stdout().flush();
+    }
+
+    /// Exit the current leaf node without statistics by ending its line.
+    pub fn finish_empty_leaf(&mut self) {
+        println!();
+    }
+
+    /// Exit the current leaf node, emitting statistics.
+    pub fn finish_leaf(&mut self, is_last: bool, stats: &Stats, bytes_format: BytesFormat) {
+        let prep_buffer = |buf: &mut String, max_span: &mut usize| {
+            buf.clear();
+            buf.push_str(&self.current_prefix);
+            // Draw a vertical bar below this leaf's branch unless it is the last child.
+            if !is_last {
+                buf.push('│');
+            }
+
+            right_pad_buffer(buf, max_span);
+        };
+
+        let buf = &mut self.write_buf;
+        buf.clear();
+
+        // Serialize max alloc counts and sizes early so we can resize columns
+        // early.
+        let serialized_max_alloc_counts = if stats.max_alloc.size.is_zero() {
+            None
+        } else {
+            Some(TreeColumn::ALL.map(|column| {
+                let Some(&max_alloc_count) = column.get_stat(&stats.max_alloc.count) else {
+                    return String::new();
+                };
+
+                let prefix = if column.is_first() { "  " } else { "" };
+                format!("{prefix}{}", util::fmt::format_f64(max_alloc_count, 4))
+            }))
+        };
+
+        let serialized_max_alloc_sizes = if stats.max_alloc.size.is_zero() {
+            None
+        } else {
+            Some(TreeColumn::ALL.map(|column| {
+                let Some(&max_alloc_size) = column.get_stat(&stats.max_alloc.size) else {
+                    return String::new();
+                };
+
+                let prefix = if column.is_first() { "  " } else { "" };
+                format!("{prefix}{}", util::fmt::format_bytes(max_alloc_size, 4, bytes_format))
+            }))
+        };
+
+        // Serialize alloc tallies early so we can resize columns early.
+        let serialized_alloc_tallies = AllocOp::ALL.map(|op| {
+            let tally = stats.alloc_tallies.get(op);
+
+            if tally.is_zero() {
+                return None;
+            }
+
+            let column_tallies = TreeColumn::ALL.map(|column| {
+                let prefix = if column.is_first() { "  " } else { "" };
+
+                let tally = AllocTally {
+                    count: column.get_stat(&tally.count).copied()?,
+                    size: column.get_stat(&tally.size).copied()?,
+                };
+
+                Some((prefix, tally))
+            });
+
+            Some(AllocTally {
+                count: column_tallies.map(|tally| {
+                    if let Some((prefix, tally)) = tally {
+                        format!("{prefix}{}", util::fmt::format_f64(tally.count, 4))
+                    } else {
+                        String::new()
+                    }
+                }),
+                size: column_tallies.map(|tally| {
+                    if let Some((prefix, tally)) = tally {
+                        format!("{prefix}{}", util::fmt::format_bytes(tally.size, 4, bytes_format))
+                    } else {
+                        String::new()
+                    }
+                }),
+            })
+        });
+
+        // Serialize counter stats early so we can resize columns early.
+        let serialized_counters = KnownCounterKind::ALL.map(|counter_kind| {
+            let counter_stats = stats.get_counts(counter_kind);
+
+            TreeColumn::ALL
+                .map(|column| -> Option<String> {
+                    let count = *column.get_stat(counter_stats?)?;
+                    let time = *column.get_stat(&stats.time)?;
+
+                    Some(
+                        AnyCounter::known(counter_kind, count)
+                            .display_throughput(time, bytes_format)
+                            .to_string(),
+                    )
+                })
+                .map(Option::unwrap_or_default)
+        });
+
+        // Set column widths based on serialized strings.
+        for column in TreeColumn::time_stats() {
+            let width = &mut self.column_widths[column as usize];
+
+            let mut update_width = |s: &str| {
+                *width = (*width).max(s.chars().count());
+            };
+
+            for counter in &serialized_counters {
+                update_width(&counter[column as usize]);
+            }
+            // NOTE(review): flattens max-alloc strings of all columns, not just this one.
+            let serialized_max_alloc_counts = serialized_max_alloc_counts.iter().flatten();
+            let serialized_max_alloc_sizes = serialized_max_alloc_sizes.iter().flatten();
+            for s in serialized_max_alloc_counts.chain(serialized_max_alloc_sizes) {
+                update_width(s);
+            }
+
+            for s in serialized_alloc_tallies
+                .iter()
+                .flatten()
+                .flat_map(AllocTally::as_array)
+                .map(|values| &values[column as usize])
+            {
+                update_width(s);
+            }
+        }
+
+        // Write time stats with iter and sample counts.
+        TreeColumnData::from_fn(|column| -> String {
+            let stat: &dyn ToString = match column {
+                TreeColumn::Fastest => &stats.time.fastest,
+                TreeColumn::Slowest => &stats.time.slowest,
+                TreeColumn::Median => &stats.time.median,
+                TreeColumn::Mean => &stats.time.mean,
+                TreeColumn::Samples => &stats.sample_count,
+                TreeColumn::Iters => &stats.iter_count,
+            };
+            stat.to_string()
+        })
+        .as_ref::<str>()
+        .write(buf, &mut self.column_widths);
+
+        println!("{buf}");
+
+        // Write counter stats.
+        let counter_stats = serialized_counters.map(TreeColumnData);
+        for counter_kind in KnownCounterKind::ALL {
+            let counter_stats = counter_stats[counter_kind as usize].as_ref::<str>();
+
+            // Skip empty rows.
+            if counter_stats.0.iter().all(|s| s.is_empty()) {
+                continue;
+            }
+
+            prep_buffer(buf, &mut self.max_name_span);
+
+            counter_stats.write(buf, &mut self.column_widths);
+            println!("{buf}");
+        }
+
+        // Write max allocated bytes.
+        if serialized_max_alloc_counts.is_some() || serialized_max_alloc_sizes.is_some() {
+            prep_buffer(buf, &mut self.max_name_span);
+
+            TreeColumnData::from_first("max alloc:").write(buf, &mut self.column_widths);
+            println!("{buf}");
+
+            for serialized in
+                [serialized_max_alloc_counts.as_ref(), serialized_max_alloc_sizes.as_ref()]
+                    .into_iter()
+                    .flatten()
+            {
+                prep_buffer(buf, &mut self.max_name_span);
+
+                TreeColumnData::from_fn(|column| serialized[column as usize].as_str())
+                    .write(buf, &mut self.column_widths);
+
+                println!("{buf}");
+            }
+        }
+
+        // Write allocation tallies.
+        for op in [AllocOp::Alloc, AllocOp::Dealloc, AllocOp::Grow, AllocOp::Shrink] {
+            let Some(tallies) = &serialized_alloc_tallies[op as usize] else {
+                continue;
+            };
+
+            prep_buffer(buf, &mut self.max_name_span);
+
+            TreeColumnData::from_first(op.prefix()).write(buf, &mut self.column_widths);
+            println!("{buf}");
+
+            for value in tallies.as_array() {
+                prep_buffer(buf, &mut self.max_name_span);
+
+                TreeColumnData::from_fn(|column| value[column as usize].as_str())
+                    .write(buf, &mut self.column_widths);
+
+                println!("{buf}");
+            }
+        }
+    }
+
+    fn has_columns(&self) -> bool {
+        self.column_widths.iter().any(|&width| width != 0) // any non-zero width
+    }
+}
+
+/// Columns of the table next to the tree, in display order.
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub(crate) enum TreeColumn {
+    Fastest,
+    Slowest,
+    Median,
+    Mean,
+    Samples, // filled from `Stats::sample_count`
+    Iters, // filled from `Stats::iter_count`
+}
+
+impl TreeColumn {
+    /// Number of columns.
+    pub const COUNT: usize = 6;
+
+    /// Every column, in display order.
+    pub const ALL: [Self; Self::COUNT] =
+        [Self::Fastest, Self::Slowest, Self::Median, Self::Mean, Self::Samples, Self::Iters];
+
+    /// Iterates over the columns that hold time statistics.
+    #[inline]
+    pub fn time_stats() -> impl Iterator<Item = Self> {
+        Self::ALL.into_iter().filter(|column| column.is_time_stat())
+    }
+
+    /// Returns `true` for the first column of `Self::ALL`.
+    #[inline]
+    pub fn is_first(self) -> bool {
+        Self::ALL[0] == self
+    }
+
+    /// Returns `true` for the last column of `Self::ALL`.
+    #[inline]
+    pub fn is_last(self) -> bool {
+        Self::ALL[Self::COUNT - 1] == self
+    }
+
+    /// Heading shown for this column.
+    fn name(self) -> &'static str {
+        const NAMES: [&str; TreeColumn::COUNT] =
+            ["fastest", "slowest", "median", "mean", "samples", "iters"];
+        NAMES[self as usize]
+    }
+
+    /// Returns `true` for the columns that hold time statistics.
+    #[inline]
+    pub fn is_time_stat(self) -> bool {
+        !matches!(self, Self::Samples | Self::Iters)
+    }
+
+    /// Picks this column's statistic out of `stats`, or `None` for the
+    /// sample and iteration count columns.
+    #[inline]
+    fn get_stat<T>(self, stats: &StatsSet<T>) -> Option<&T> {
+        let stat = match self {
+            Self::Fastest => &stats.fastest,
+            Self::Slowest => &stats.slowest,
+            Self::Median => &stats.median,
+            Self::Mean => &stats.mean,
+            Self::Samples | Self::Iters => return None,
+        };
+        Some(stat)
+    }
+}
+
+/// One value per `TreeColumn`, indexed by the column's discriminant.
+#[derive(Default)]
+struct TreeColumnData<T>([T; TreeColumn::COUNT]);
+
+impl<T> TreeColumnData<T> {
+    /// Puts `value` in the first column and default values in the rest.
+    #[inline]
+    fn from_first(value: T) -> Self
+    where
+        Self: Default,
+    {
+        let Self(mut cells) = Self::default();
+        cells[0] = value;
+        Self(cells)
+    }
+
+    /// Builds the data by calling `f` once per column, in `TreeColumn::ALL` order.
+    #[inline]
+    fn from_fn<F: FnMut(TreeColumn) -> T>(f: F) -> Self {
+        Self(TreeColumn::ALL.map(f))
+    }
+}
+
+impl TreeColumnData<&str> {
+    /// Writes the column data into the buffer.
+    ///
+    /// Cells are joined with ` │ `. Every cell except the last is right-padded
+    /// to its column width, and a cell wider than its column widens the column
+    /// instead. An empty last cell leaves no trailing space after the separator.
+    fn write(&self, buf: &mut String, column_widths: &mut [usize; TreeColumn::COUNT]) {
+        let last_index = TreeColumn::COUNT - 1;
+
+        for (index, (cell, width)) in self.0.iter().zip(column_widths.iter_mut()).enumerate() {
+            let cell_width = cell.chars().count();
+
+            // Separator before every cell but the first. An empty final cell
+            // gets the separator without its trailing space.
+            match (index, cell_width) {
+                (0, _) => {}
+                (i, 0) if i == last_index => buf.push_str(" │"),
+                _ => buf.push_str(" │ "),
+            }
+
+            buf.push_str(cell);
+
+            if index == last_index {
+                continue;
+            }
+
+            // Right-pad remaining width or update column width to new maximum.
+            match width.checked_sub(cell_width) {
+                Some(padding) => buf.extend(repeat(' ').take(padding)),
+                None => *width = cell_width,
+            }
+        }
+    }
+}
+
+impl<T> TreeColumnData<T> {
+    #[inline]
+    fn as_ref<U: ?Sized>(&self) -> TreeColumnData<&U>
+    where
+        T: AsRef<U>,
+    {
+        TreeColumnData(TreeColumn::ALL.map(|column| self.0[column as usize].as_ref()))
+    }
+}
+
+/// Pads `buf` with spaces up to `max_span` plus `TREE_COL_BUF` columns, raising
+/// `max_span` to the unpadded width of `buf` when that is larger.
+fn right_pad_buffer(buf: &mut String, max_span: &mut usize) {
+    let span = buf.chars().count();
+    let padding = TREE_COL_BUF + max_span.saturating_sub(span);
+
+    buf.extend(repeat(' ').take(padding));
+    *max_span = span.max(*max_span);
+}
diff --git a/crates/divan_compat/divan_fork/src/util/fmt.rs b/crates/divan_compat/divan_fork/src/util/fmt.rs
new file mode 100644
index 00000000..4b8a4bec
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/util/fmt.rs
@@ -0,0 +1,229 @@
+use std::fmt;
+
+use crate::counter::{AnyCounter, BytesFormat, KnownCounterKind};
+
+/// Formats an `f64` to the given number of significant figures.
+///
+/// Digits are truncated rather than rounded, every character before the
+/// decimal point counts as a figure, and trailing zeros of the kept fraction
+/// are dropped.
+///
+/// For example, `format_f64(123.456, 4)` gives `"123.4"`.
+pub(crate) fn format_f64(val: f64, sig_figs: usize) -> String {
+    let mut text = val.to_string();
+
+    // Values printed without a decimal point are returned untouched.
+    let Some(dot) = text.find('.') else {
+        return text;
+    };
+
+    // No room left for fractional digits: keep the integer part only.
+    let kept_fraction = sig_figs.saturating_sub(dot);
+    if kept_fraction == 0 {
+        text.truncate(dot);
+        return text;
+    }
+
+    // A fraction with fewer digits than allowed is already short enough.
+    let fraction_end = dot + 1 + kept_fraction;
+    let Some(fraction) = text.get(dot + 1..fraction_end) else {
+        return text;
+    };
+
+    // Cut after the last non-zero kept digit, or at the decimal point if
+    // every kept digit is zero.
+    let significant = fraction.trim_end_matches('0').len();
+    let new_len = if significant == 0 { dot } else { dot + 1 + significant };
+    text.truncate(new_len);
+    text
+}
+
+/// Formats a byte count to `sig_figs` significant figures at the scale chosen by
+/// `scale_value`, followed by a space and the scale's byte suffix.
+pub(crate) fn format_bytes(val: f64, sig_figs: usize, bytes_format: BytesFormat) -> String {
+    let (scaled, scale) = scale_value(val, bytes_format);
+    let suffix = scale.suffix(ScaleFormat::Bytes(bytes_format));
+
+    format!("{} {suffix}", format_f64(scaled, sig_figs))
+}
+
+/// Displays the throughput of `counter` over `picos` picoseconds.
+pub(crate) struct DisplayThroughput<'a> {
+    pub counter: &'a AnyCounter,
+    pub picos: f64,
+    pub bytes_format: BytesFormat,
+}
+
+/// `Debug` output is the same as `Display` output.
+impl fmt::Debug for DisplayThroughput<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        <Self as fmt::Display>::fmt(self, f)
+    }
+}
+
+impl fmt::Display for DisplayThroughput<'_> {
+    /// Writes the per-second rate and its unit. Precision sets the significant
+    /// figures (default 4); width pads with the fill, left-aligned only.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let format = match self.counter.kind {
+            KnownCounterKind::Bytes => ScaleFormat::BytesThroughput(self.bytes_format),
+            KnownCounterKind::Chars => ScaleFormat::CharsThroughput,
+            KnownCounterKind::Cycles => ScaleFormat::CyclesThroughput,
+            KnownCounterKind::Items => ScaleFormat::ItemsThroughput,
+        };
+
+        let count = self.counter.count();
+        let per_second = if count == 0 { 0. } else { count as f64 * (1e12 / self.picos) };
+
+        let (scaled, scale) = scale_value(per_second, format.bytes_format());
+        let mut text = format_f64(scaled, f.precision().unwrap_or(4));
+        text.push(' ');
+        text.push_str(scale.suffix(format));
+
+        // Fill up to specified width.
+        let fill_len = f.width().and_then(|width| width.checked_sub(text.len()));
+        match (fill_len, f.align()) {
+            (None, _) => {}
+            (Some(n), None | Some(fmt::Alignment::Left)) => {
+                text.extend(std::iter::repeat(f.fill()).take(n));
+            }
+            (Some(_), _) => return Err(fmt::Error),
+        }
+
+        f.write_str(&text)
+    }
+}
+
+/// Converts a value to the appropriate scale.
+///
+/// Returns the value divided by the chosen scale's start, with that scale.
+/// Infinite values are left at `Scale::One`.
+fn scale_value(value: f64, bytes_format: BytesFormat) -> (f64, Scale) {
+    const SCALES: [Scale; Scale::COUNT] =
+        [Scale::One, Scale::Kilo, Scale::Mega, Scale::Giga, Scale::Tera, Scale::Peta];
+
+    let starts = scale_starts(bytes_format);
+
+    // Pick the first scale whose successor starts above the value; values
+    // at or beyond the last start (and NaN) fall through to the last scale.
+    let index = if value.is_infinite() {
+        0
+    } else {
+        starts[1..].iter().position(|&next| value < next).unwrap_or(Scale::COUNT - 1)
+    };
+
+    (value / starts[index], SCALES[index])
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum Scale {
+    One, // unscaled, e.g. "B"
+    Kilo, // e.g. "KB" or "KiB"
+    Mega, // e.g. "MB" or "MiB"
+    Giga, // e.g. "GB" or "GiB"
+    Tera, // e.g. "TB" or "TiB"
+    Peta, // e.g. "PB" or "PiB"
+}
+
+#[derive(Clone, Copy)]
+pub(crate) enum ScaleFormat {
+    Bytes(BytesFormat), // suffixes such as "B", "KB" or "KiB"
+    BytesThroughput(BytesFormat), // suffixes such as "B/s", "KB/s" or "KiB/s"
+    CharsThroughput, // suffixes such as "char/s", "Kchar/s"
+    CyclesThroughput, // suffixes such as "Hz", "KHz"
+    ItemsThroughput, // suffixes such as "item/s", "Kitem/s"
+}
+
+impl ScaleFormat {
+    pub fn bytes_format(self) -> BytesFormat {
+        match self {
+            Self::CharsThroughput | Self::CyclesThroughput | Self::ItemsThroughput => {
+                BytesFormat::Decimal
+            }
+            Self::Bytes(carried) | Self::BytesThroughput(carried) => carried, // own format
+        }
+    }
+}
+
+/// Returns the value at which each scale starts for `bytes_format`.
+///
+/// The first row holds powers of 1000 and the second powers of 1024; the row
+/// is selected by the discriminant of `bytes_format`.
+fn scale_starts(bytes_format: BytesFormat) -> &'static [f64; Scale::COUNT] {
+    const K: f64 = 1024.;
+
+    const STARTS: &[[f64; Scale::COUNT]; 2] = &[
+        [1., 1e3, 1e6, 1e9, 1e12, 1e15],
+        [1., K, K * K, K * K * K, K * K * K * K, K * K * K * K * K],
+    ];
+
+    let row = bytes_format as usize;
+    &STARTS[row]
+}
+
+impl Scale {
+    /// Number of scales.
+    const COUNT: usize = 6;
+
+    /// Returns the unit suffix for this scale under `format`.
+    ///
+    /// Byte formats pick between two suffix rows by the discriminant of the
+    /// `BytesFormat` they carry.
+    pub fn suffix(self, format: ScaleFormat) -> &'static str {
+        type Row = [&'static str; Scale::COUNT];
+
+        const BYTES: &[Row; 2] = &[
+            ["B", "KB", "MB", "GB", "TB", "PB"],
+            ["B", "KiB", "MiB", "GiB", "TiB", "PiB"],
+        ];
+
+        const BYTES_PER_SEC: &[Row; 2] = &[
+            ["B/s", "KB/s", "MB/s", "GB/s", "TB/s", "PB/s"],
+            ["B/s", "KiB/s", "MiB/s", "GiB/s", "TiB/s", "PiB/s"],
+        ];
+
+        const CHARS_PER_SEC: &Row =
+            &["char/s", "Kchar/s", "Mchar/s", "Gchar/s", "Tchar/s", "Pchar/s"];
+
+        const CYCLES_PER_SEC: &Row = &["Hz", "KHz", "MHz", "GHz", "THz", "PHz"];
+
+        const ITEMS_PER_SEC: &Row =
+            &["item/s", "Kitem/s", "Mitem/s", "Gitem/s", "Titem/s", "Pitem/s"];
+
+        // Choose the suffix row first, then index it by scale.
+        let row = match format {
+            ScaleFormat::Bytes(bytes_format) => &BYTES[bytes_format as usize],
+            ScaleFormat::BytesThroughput(bytes_format) => &BYTES_PER_SEC[bytes_format as usize],
+            ScaleFormat::CharsThroughput => CHARS_PER_SEC,
+            ScaleFormat::CyclesThroughput => CYCLES_PER_SEC,
+            ScaleFormat::ItemsThroughput => ITEMS_PER_SEC,
+        };
+
+        row[self as usize]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Each power of 1000 scales to exactly 1 at the matching decimal scale.
+    #[test]
+    fn scale_value() {
+        let cases = [
+            (1., Scale::One),
+            (1_000., Scale::Kilo),
+            (1_000_000., Scale::Mega),
+            (1_000_000_000., Scale::Giga),
+            (1_000_000_000_000., Scale::Tera),
+            (1_000_000_000_000_000., Scale::Peta),
+        ];
+
+        // Every case uses the decimal format.
+        for (input, expected_scale) in cases {
+            let actual = super::scale_value(input, BytesFormat::Decimal);
+
+            assert_eq!(actual, (1., expected_scale), "input: {input}");
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/util/mod.rs b/crates/divan_compat/divan_fork/src/util/mod.rs
new file mode 100644
index 00000000..6ac8cbfb
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/util/mod.rs
@@ -0,0 +1,106 @@
+use std::{
+    mem::ManuallyDrop,
+    num::NonZeroUsize,
+    sync::atomic::{AtomicUsize, Ordering::Relaxed},
+};
+
+pub mod fmt;
+pub mod sort;
+pub mod sync;
+pub mod thread;
+pub mod ty;
+
+/// Public-in-private type like `()` but meant to be externally-unreachable.
+///
+/// Using this in place of `()` for `GenI` prevents `Bencher::with_inputs` from
+/// working with `()` unintentionally.
+#[non_exhaustive]
+pub struct Unit;
+
+#[inline]
+pub(crate) fn defer<F: FnOnce()>(f: F) -> impl Drop {
+    struct Defer<F: FnOnce()>(ManuallyDrop<F>);
+    // Dropping the returned guard is what finally runs the stored closure.
+    impl<F: FnOnce()> Drop for Defer<F> {
+        #[inline]
+        fn drop(&mut self) {
+            let f = unsafe { ManuallyDrop::take(&mut self.0) };
+            // SAFETY of `take` above: `drop` runs at most once and `self.0` is not used afterwards.
+            f();
+        }
+    }
+
+    Defer(ManuallyDrop::new(f))
+}
+
+/// Returns the index of `ptr` in `slice`; `ptr` must point into it and `T` must not be zero-sized.
+#[inline]
+pub(crate) fn slice_ptr_index<T>(slice: &[T], ptr: *const T) -> usize {
+    // Plain address arithmetic, so no `unsafe` pointer `offset_from` call is needed.
+    (ptr as usize - slice.as_ptr() as usize) / size_of::<T>()
+}
+
+/// Returns the values in the middle of `slice`.
+///
+/// Even lengths yield the two middle values, odd lengths one, and an empty slice yields itself.
+#[inline]
+pub(crate) fn slice_middle<T>(slice: &[T]) -> &[T] {
+    let len = slice.len();
+
+    if len == 0 {
+        slice
+    } else if len % 2 == 0 {
+        &slice[(len / 2) - 1..][..2]
+    } else {
+        &slice[len / 2..][..1]
+    }
+}
+
+/// Cached [`std::thread::available_parallelism`], treated as 1 when it cannot be queried.
+#[inline]
+pub(crate) fn known_parallelism() -> NonZeroUsize {
+    static CACHED: AtomicUsize = AtomicUsize::new(0);
+
+    #[cold]
+    fn slow() -> NonZeroUsize {
+        let n = std::thread::available_parallelism().unwrap_or(NonZeroUsize::MIN);
+
+        match CACHED.compare_exchange(0, n.get(), Relaxed, Relaxed) {
+            Ok(_) => n,
+
+            // SAFETY: The exchange only fails when the cache already holds a non-zero count.
+            Err(n) => unsafe { NonZeroUsize::new_unchecked(n) },
+        }
+    }
+
+    match NonZeroUsize::new(CACHED.load(Relaxed)) {
+        Some(n) => n,
+        None => slow(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::black_box;
+
+    use super::*;
+
+    #[test]
+    fn known_parallelism() {
+        let f: fn() -> NonZeroUsize = super::known_parallelism;
+        assert_eq!(black_box(f)(), black_box(f)()); // The cached value must be stable across calls.
+    }
+
+    #[test]
+    fn slice_middle() {
+        use super::slice_middle;
+
+        // An empty slice has no middle; `is_empty` sidesteps typing a bare `&[]` literal.
+        assert!(slice_middle::<i32>(&[]).is_empty());
+        assert_eq!(slice_middle(&[1]), &[1]);
+        assert_eq!(slice_middle(&[1, 2]), &[1, 2]);
+        assert_eq!(slice_middle(&[1, 2, 3]), &[2]);
+        assert_eq!(slice_middle(&[1, 2, 3, 4]), &[2, 3]);
+        assert_eq!(slice_middle(&[1, 2, 3, 4, 5]), &[3]);
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/util/sort.rs b/crates/divan_compat/divan_fork/src/util/sort.rs
new file mode 100644
index 00000000..d86d416a
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/util/sort.rs
@@ -0,0 +1,139 @@
+use std::cmp::Ordering;
+
+/// Compares strings by treating internal integers as atomic units.
+pub fn natural_cmp(a: &str, b: &str) -> Ordering {
+    Iterator::cmp(Tokenizer { input: a }, Tokenizer { input: b })
+}
+
+#[inline]
+fn cmp_int(mut a: &str, mut b: &str) -> Ordering {
+    a = a.trim_start_matches('0');
+    b = b.trim_start_matches('0');
+
+    // With leading zeros stripped, an empty string stands for the value 0.
+    match (a.is_empty(), b.is_empty()) {
+        (true, true) => return Ordering::Equal,
+        (true, false) => return Ordering::Less,
+        (false, true) => return Ordering::Greater,
+        _ => {}
+    }
+
+    // Without leading zeros, the string with more digits is the larger number.
+    match a.len().cmp(&b.len()) {
+        Ordering::Equal => {}
+        ord => return ord,
+    }
+
+    // Same digit count: byte-wise string order matches numeric order.
+    a.cmp(b)
+}
+
+#[derive(PartialEq, Eq)]
+#[cfg_attr(test, derive(Debug))]
+struct Token<'a> {
+    is_int: bool,
+    text: &'a str,
+}
+
+impl PartialOrd for Token<'_> {
+    #[inline]
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Token<'_> {
+    #[inline]
+    fn cmp(&self, other: &Self) -> Ordering {
+        if self.is_int && other.is_int {
+            cmp_int(self.text, other.text)
+        } else {
+            self.text.cmp(other.text)
+        }
+    }
+}
+
+/// Lexes a string into "tokens".
+struct Tokenizer<'a> {
+    /// The remaining characters to process.
+    input: &'a str,
+}
+
+impl<'a> Iterator for Tokenizer<'a> {
+    type Item = Token<'a>;
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut bytes = self.input.bytes();
+        let is_int = bytes.next()?.is_ascii_digit();
+
+        let mut kind_len = 1;
+        for ch in bytes {
+            // Stop on character kind change.
+            if ch.is_ascii_digit() != is_int {
+                break;
+            }
+
+            kind_len += 1;
+        }
+        // SAFETY: `kind_len <= input.len()`; kinds only flip next to an ASCII digit (char boundary).
+        unsafe {
+            let text = self.input.get_unchecked(..kind_len);
+            self.input = self.input.get_unchecked(kind_len..);
+
+            Some(Token { is_int, text })
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[track_caller]
+    fn test_sort(list: &[&str], cmp: fn(&str, &str) -> Ordering) {
+        let mut copy = list.to_vec();
+        copy.sort_by(|a, b| cmp(a, b));
+        assert_eq!(list, copy);
+    }
+
+    #[test]
+    fn natural_cmp() {
+        #[track_caller]
+        fn test(list: &[&str]) {
+            test_sort(list, super::natural_cmp);
+        }
+
+        test(&["A<4>", "A<8>", "A<16>", "A<32>", "A<64>"]);
+    }
+
+    #[test]
+    fn cmp_int() {
+        #[track_caller]
+        fn test(list: &[&str]) {
+            test_sort(list, super::cmp_int);
+        }
+
+        test(&["4", "8", "16", "32", "64"]);
+        test(&["4", "08"]);
+        test(&["0", "00"]);
+    }
+
+    #[test]
+    fn tokenize() {
+        #[track_caller]
+        fn test(s: &str, expected: &[Token]) {
+            let tokens: Vec<Token> = Tokenizer { input: s }.collect();
+            assert_eq!(tokens, expected);
+        }
+
+        test(
+            "A<4>",
+            &[
+                Token { text: "A<", is_int: false },
+                Token { text: "4", is_int: true },
+                Token { text: ">", is_int: false },
+            ],
+        );
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/util/sync.rs b/crates/divan_compat/divan_fork/src/util/sync.rs
new file mode 100644
index 00000000..d84f07be
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/util/sync.rs
@@ -0,0 +1,121 @@
+//! Synchronization utilities.
+
+#![cfg_attr(not(target_os = "macos"), allow(unused))]
+
+use std::{
+    ops::{Deref, DerefMut},
+    sync::atomic::*,
+};
+
+/// Makes the wrapped value [`Sync`] even though it isn't ([`Send`] is not affected).
+pub struct SyncWrap<T> {
+    pub value: T,
+}
+
+unsafe impl<T> Sync for SyncWrap<T> {}
+
+impl<T> Deref for SyncWrap<T> {
+    type Target = T;
+
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        &self.value
+    }
+}
+
+impl<T> DerefMut for SyncWrap<T> {
+    #[inline]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.value
+    }
+}
+
+impl<T> SyncWrap<T> {
+    #[inline]
+    pub const unsafe fn new(value: T) -> Self {
+        Self { value }
+    }
+}
+
+/// A convenience wrapper around `AtomicBool` whose loads and stores all use `Relaxed` ordering.
+pub(crate) struct AtomicFlag(AtomicBool);
+
+impl AtomicFlag {
+    #[inline]
+    pub const fn new(value: bool) -> Self {
+        Self(AtomicBool::new(value))
+    }
+
+    #[inline]
+    pub fn get(&self) -> bool {
+        self.0.load(Ordering::Relaxed)
+    }
+
+    #[inline]
+    pub fn set(&self, value: bool) {
+        self.0.store(value, Ordering::Relaxed);
+    }
+}
+
+/// Prevents false sharing by aligning to an assumed 64-byte cache line.
+#[derive(Clone, Copy)]
+#[repr(align(64))]
+pub(crate) struct CachePadded<T>(pub T);
+
+/// Alias to the atomic equivalent of `T`.
+pub(crate) type Atomic<T> = <T as WithAtomic>::Atomic;
+
+/// A type with an associated atomic type.
+pub(crate) trait WithAtomic {
+    type Atomic;
+}
+
+#[cfg(target_has_atomic = "ptr")]
+impl WithAtomic for usize {
+    type Atomic = AtomicUsize;
+}
+
+#[cfg(target_has_atomic = "ptr")]
+impl WithAtomic for isize {
+    type Atomic = AtomicIsize;
+}
+
+#[cfg(target_has_atomic = "8")]
+impl WithAtomic for u8 {
+    type Atomic = AtomicU8;
+}
+
+#[cfg(target_has_atomic = "8")]
+impl WithAtomic for i8 {
+    type Atomic = AtomicI8;
+}
+
+#[cfg(target_has_atomic = "16")]
+impl WithAtomic for u16 {
+    type Atomic = AtomicU16;
+}
+
+#[cfg(target_has_atomic = "16")]
+impl WithAtomic for i16 {
+    type Atomic = AtomicI16;
+}
+
+#[cfg(target_has_atomic = "32")]
+impl WithAtomic for u32 {
+    type Atomic = AtomicU32;
+}
+
+#[cfg(target_has_atomic = "32")]
+impl WithAtomic for i32 {
+    type Atomic = AtomicI32;
+}
+
+#[cfg(target_has_atomic = "64")]
+impl WithAtomic for u64 {
+    type Atomic = AtomicU64;
+}
+
+#[cfg(target_has_atomic = "64")]
+impl WithAtomic for i64 {
+    type Atomic = AtomicI64;
+}
diff --git a/crates/divan_compat/divan_fork/src/util/thread.rs b/crates/divan_compat/divan_fork/src/util/thread.rs
new file mode 100644
index 00000000..6262889e
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/util/thread.rs
@@ -0,0 +1,193 @@
+//! Threading utilities.
+
+#![cfg(target_os = "macos")]
+
+use std::{marker::PhantomData, ptr::NonNull, sync::atomic::Ordering::*};
+
+use libc::pthread_key_t;
+
+use crate::util::sync::Atomic;
+
+const KEY_UNINIT: pthread_key_t = 0;
+
+/// Thread-local key accessed via
+/// [`pthread_getspecific`](https://pubs.opengroup.org/onlinepubs/9699919799/functions/pthread_getspecific.html).
+pub(crate) struct PThreadKey<T: 'static> {
+    value: AtomicPThreadKey,
+    marker: PhantomData<&'static T>,
+}
+
+impl<T> PThreadKey<T> {
+    #[inline]
+    pub const fn new() -> Self {
+        Self { value: AtomicPThreadKey::new(KEY_UNINIT), marker: PhantomData }
+    }
+
+    #[inline]
+    pub fn get(&self) -> Option<NonNull<T>> {
+        match self.value.load(Relaxed) {
+            KEY_UNINIT => None,
+
+            key => unsafe {
+                cfg_if::cfg_if! {
+                    if #[cfg(all(
+                        not(miri),
+                        any(target_arch = "x86_64", target_arch = "aarch64"),
+                    ))] {
+                        let thread_local = fast::get_thread_local(key as usize);
+
+                        #[cfg(test)]
+                        assert_eq!(thread_local, libc::pthread_getspecific(key));
+                    } else {
+                        let thread_local = libc::pthread_getspecific(key);
+                    }
+                }
+
+                NonNull::new(thread_local.cast())
+            },
+        }
+    }
+
+    /// Assigns the value with its destructor, returning whether a key could be created and set.
+    #[inline]
+    pub fn set<D>(&self, ptr: *const T, _: D) -> bool
+    where
+        D: FnOnce(NonNull<T>) + Copy,
+    {
+        assert_eq!(size_of::<D>(), 0);
+
+        unsafe extern "C" fn dtor<T, D>(ptr: *mut libc::c_void)
+        where
+            T: 'static,
+            D: FnOnce(NonNull<T>) + Copy,
+        {
+            // SAFETY: The dtor is zero-sized, so we can make one from thin air.
+            let dtor: D = unsafe { std::mem::zeroed() };
+
+            // Although we're guaranteed `ptr` is not null, check in case.
+            if let Some(ptr) = NonNull::new(ptr) {
+                dtor(ptr.cast());
+            }
+        }
+
+        let shared_key = &self.value;
+        let mut local_key = shared_key.load(Relaxed);
+
+        // Race against other threads to initialize `shared_key`.
+        if local_key == KEY_UNINIT {
+            if unsafe { libc::pthread_key_create(&mut local_key, Some(dtor::<T, D>)) } == 0 {
+                // Race to store our key into the global instance.
+                //
+                // On failure, delete our key and use the winner's key.
+                if let Err(their_key) =
+                    shared_key.compare_exchange(KEY_UNINIT, local_key, Relaxed, Relaxed)
+                {
+                    // SAFETY: No other thread is accessing this key.
+                    unsafe { libc::pthread_key_delete(local_key) };
+
+                    local_key = their_key;
+                }
+            } else {
+                // On create failure, check if another thread succeeded.
+                local_key = shared_key.load(Relaxed);
+                if local_key == KEY_UNINIT {
+                    return false;
+                }
+            }
+        }
+
+        // This is the slow path, so don't bother with writing via
+        // `gs`/`tpidrro_el0` register.
+        //
+        // SAFETY: The key has been created by us or another thread.
+        unsafe { libc::pthread_setspecific(local_key, ptr.cast()) == 0 }
+    }
+}
+
+/// Alias to the atomic equivalent of `pthread_key_t`.
+pub(crate) type AtomicPThreadKey = Atomic<pthread_key_t>;
+
+/// Optimized alternatives to `pthread_getspecific`.
+pub(crate) mod fast {
+    // Apple reserves key 11 (`__PTK_LIBC_RESERVED_WIN64`) for Windows:
+    // https://github.com/apple-oss-distributions/libpthread/blob/libpthread-519/private/pthread/tsd_private.h#L99
+    //
+    // Key 6 is also reserved for Windows and Go, but we don't use it because
+    // it's more well known and likely to be used by more libraries.
+
+    /// Returns a pointer to a static thread-local variable.
+    #[inline]
+    #[cfg(all(not(miri), not(feature = "dyn_thread_local"), target_arch = "x86_64"))]
+    pub fn get_static_thread_local<T>() -> *const T {
+        unsafe {
+            let result;
+            std::arch::asm!(
+                "mov {}, gs:[88]",
+                out(reg) result,
+                options(pure, readonly, nostack, preserves_flags),
+            );
+            result
+        }
+    }
+
+    /// Sets the static thread-local variable.
+    ///
+    /// # Safety
+    ///
+    /// If the slot is in use, we will corrupt the other user's memory.
+    #[inline]
+    #[cfg(all(not(miri), not(feature = "dyn_thread_local"), target_arch = "x86_64"))]
+    pub unsafe fn set_static_thread_local<T>(ptr: *const T) {
+        unsafe {
+            std::arch::asm!(
+                "mov gs:[88], {}",
+                in(reg) ptr,
+                options(nostack, preserves_flags),
+            );
+        }
+    }
+
+    /// Returns a pointer to the corresponding thread-local variable.
+    ///
+    /// The first element is reserved for `pthread_self`. This is widely known
+    /// and also mentioned in page 251 of "*OS Internals Volume 1" by Jonathan
+    /// Levin.
+    ///
+    /// It appears that `pthread_key_create` allocates a slot into the buffer
+    /// referenced by:
+    /// - [`gs` on x86_64](https://github.com/apple-oss-distributions/xnu/blob/xnu-10002.41.9/libsyscall/os/tsd.h#L126)
+    /// - [`tpidrro_el0` on AArch64](https://github.com/apple-oss-distributions/xnu/blob/xnu-10002.41.9/libsyscall/os/tsd.h#L163)
+    ///
+    /// # Safety
+    ///
+    /// `key` must not cause an out-of-bounds lookup.
+    #[inline]
+    #[cfg(all(not(miri), any(target_arch = "x86_64", target_arch = "aarch64")))]
+    pub unsafe fn get_thread_local(key: usize) -> *mut libc::c_void {
+        #[cfg(target_arch = "x86_64")]
+        {
+            let result;
+            std::arch::asm!(
+                "mov {}, gs:[8 * {1}]",
+                out(reg) result,
+                in(reg) key,
+                options(pure, readonly, nostack, preserves_flags),
+            );
+            result
+        }
+
+        #[cfg(target_arch = "aarch64")]
+        {
+            let result: *const *mut libc::c_void;
+            std::arch::asm!(
+                "mrs {0}, tpidrro_el0",
+                // Clear bottom 3 bits just in case. This was historically the CPU
+                // core ID but that changed at some point.
+                "and {0}, {0}, #-8",
+                out(reg) result,
+                options(pure, nomem, nostack, preserves_flags),
+            );
+            *result.add(key)
+        }
+    }
+}
diff --git a/crates/divan_compat/divan_fork/src/util/ty.rs b/crates/divan_compat/divan_fork/src/util/ty.rs
new file mode 100644
index 00000000..0fb7f536
--- /dev/null
+++ b/crates/divan_compat/divan_fork/src/util/ty.rs
@@ -0,0 +1,38 @@
+use std::{
+    any::{Any, TypeId},
+    marker::PhantomData,
+};
+
+/// Returns a [`TypeId`] for any type regardless of whether it is `'static`.
+///
+/// Note that **this is not the same** as [`TypeId::of`].
+#[inline]
+pub(crate) fn proxy_type_id<T: ?Sized>() -> TypeId {
+    // Return the type ID of a generic closure.
+    Any::type_id(&|| PhantomData::<T>)
+}
+
+/// Returns `true` if the given types are equal.
+#[inline]
+pub(crate) fn is_type_eq<A: ?Sized, B: ?Sized>() -> bool {
+    proxy_type_id::<A>() == proxy_type_id::<B>()
+}
+
+/// Convenience trait for type conversions.
+pub(crate) trait TypeCast {
+    /// Converts a reference if `self` is an instance of `T`.
+    ///
+    /// We require `T: 'static` since we want to ensure when providing a type
+    /// that any lifetimes are static, such as `Cow<str>`.
+    #[inline]
+    fn cast_ref<T: 'static>(&self) -> Option<&T> {
+        if is_type_eq::<Self, T>() {
+            // SAFETY: `self` is `&T`.
+            Some(unsafe { &*(self as *const Self as *const T) })
+        } else {
+            None
+        }
+    }
+}
+
+impl<A> TypeCast for A {}
diff --git a/crates/divan_compat/divan_fork/tests/attr_options.rs b/crates/divan_compat/divan_fork/tests/attr_options.rs
new file mode 100644
index 00000000..126c94e6
--- /dev/null
+++ b/crates/divan_compat/divan_fork/tests/attr_options.rs
@@ -0,0 +1,59 @@
+// Tests that attribute options produce the correct results.
+
+// Miri cannot discover benchmarks.
+#![cfg(not(miri))]
+
+use std::sync::atomic::{AtomicUsize, Ordering::SeqCst};
+
+extern crate codspeed_divan_compat_walltime as divan;
+use divan::Divan;
+
+static CHILD1_ITERS: AtomicUsize = AtomicUsize::new(0);
+static CHILD2_ITERS: AtomicUsize = AtomicUsize::new(0);
+static CHILD3_ITERS: AtomicUsize = AtomicUsize::new(0);
+
+#[divan::bench_group(sample_count = 10, sample_size = 50)]
+mod parent {
+    use super::*;
+
+    // `sample_count` 10 (from `parent`) × `sample_size` 1 = 10 iterations
+    #[divan::bench_group(sample_size = 1)]
+    mod child1 {
+        use super::*;
+
+        #[divan::bench]
+        fn bench() {
+            CHILD1_ITERS.fetch_add(1, SeqCst);
+        }
+    }
+
+    // `sample_count` 42 × `sample_size` 50 (from `parent`) = 2100 iterations
+    #[divan::bench_group(sample_count = 42)]
+    mod child2 {
+        use super::*;
+
+        #[divan::bench]
+        fn bench() {
+            CHILD2_ITERS.fetch_add(1, SeqCst);
+        }
+    }
+
+    mod child3 {
+        use super::*;
+
+        // `sample_count` 1 × `sample_size` 50 (from `parent`) = 50 iterations
+        #[divan::bench(sample_count = 1)]
+        fn bench() {
+            CHILD3_ITERS.fetch_add(1, SeqCst);
+        }
+    }
+}
+
+#[test]
+fn iter_count() {
+    Divan::default().run_benches();
+    // Each counter must equal `sample_count` × `sample_size` for its benchmark.
+    assert_eq!(CHILD1_ITERS.load(SeqCst), 10);
+    assert_eq!(CHILD2_ITERS.load(SeqCst), 2100);
+    assert_eq!(CHILD3_ITERS.load(SeqCst), 50);
+}
diff --git a/crates/divan_compat/divan_fork/tests/entry_properties.rs b/crates/divan_compat/divan_fork/tests/entry_properties.rs
new file mode 100644
index 00000000..0e5b7ea0
--- /dev/null
+++ b/crates/divan_compat/divan_fork/tests/entry_properties.rs
@@ -0,0 +1,122 @@
+// Tests that entry benchmarks/groups have correct generated properties.
+
+// Miri cannot discover benchmarks.
+#![cfg(not(miri))]
+
+extern crate codspeed_divan_compat_walltime as divan;
+use divan::__private::{EntryMeta, BENCH_ENTRIES, GROUP_ENTRIES};
+
+#[divan::bench]
+fn outer() {}
+
+#[divan::bench_group]
+mod outer_group {
+    #[divan::bench]
+    fn inner() {}
+
+    #[divan::bench_group]
+    mod inner_group {}
+}
+
+#[divan::bench]
+#[ignore]
+fn ignored_1() {}
+
+#[divan::bench(ignore)]
+fn ignored_2() {}
+
+#[divan::bench_group]
+#[allow(unused_attributes)]
+#[ignore]
+mod ignored_group {
+    #[divan::bench]
+    fn not_yet_ignored() {}
+}
+
+/// Finds `EntryMeta` based on the entry's raw name.
+macro_rules! find_meta {
+    ($entries:expr, $raw_name:literal) => {
+        $entries
+            .iter()
+            .map(|entry| &entry.meta)
+            .find(|common| common.raw_name == $raw_name)
+            .expect(concat!($raw_name, " not found"))
+    };
+}
+
+fn find_outer() -> &'static EntryMeta {
+    find_meta!(BENCH_ENTRIES, "outer")
+}
+
+fn find_inner() -> &'static EntryMeta {
+    find_meta!(BENCH_ENTRIES, "inner")
+}
+
+fn find_outer_group() -> &'static EntryMeta {
+    find_meta!(GROUP_ENTRIES, "outer_group")
+}
+
+fn find_inner_group() -> &'static EntryMeta {
+    find_meta!(GROUP_ENTRIES, "inner_group")
+}
+
+#[test]
+fn file() {
+    let file = file!();
+
+    assert_eq!(find_outer().location.file, file);
+    assert_eq!(find_outer_group().location.file, file);
+
+    assert_eq!(find_inner().location.file, file);
+    assert_eq!(find_inner_group().location.file, file);
+}
+
+#[test]
+fn module_path() {
+    let outer_path = module_path!();
+    assert_eq!(find_outer().module_path, outer_path);
+    assert_eq!(find_outer_group().module_path, outer_path);
+
+    let inner_path = format!("{outer_path}::outer_group");
+    assert_eq!(find_inner().module_path, inner_path);
+    assert_eq!(find_inner_group().module_path, inner_path);
+}
+
+// Expected lines are those of each entry's attribute, shifted by the fork's `extern crate` line.
+#[test]
+fn line() {
+    assert_eq!(find_outer().location.line, 9);
+    assert_eq!(find_outer_group().location.line, 12);
+
+    assert_eq!(find_inner().location.line, 14);
+    assert_eq!(find_inner_group().location.line, 17);
+}
+
+#[test]
+fn column() {
+    assert_eq!(find_outer().location.col, 1);
+    assert_eq!(find_outer_group().location.col, 1);
+
+    assert_eq!(find_inner().location.col, 5);
+    assert_eq!(find_inner_group().location.col, 5);
+}
+
+#[test]
+fn ignore() {
+    fn get_ignore(meta: &EntryMeta) -> bool {
+        meta.bench_options.as_ref().and_then(|options| options.ignore).unwrap_or_default()
+    }
+
+    assert!(get_ignore(find_meta!(BENCH_ENTRIES, "ignored_1")));
+    assert!(get_ignore(find_meta!(BENCH_ENTRIES, "ignored_2")));
+    assert!(get_ignore(find_meta!(GROUP_ENTRIES, "ignored_group")));
+
+    // Although its parent is marked as `#[ignore]`, it itself is not yet known
+    // to be ignored.
+    assert!(!get_ignore(find_meta!(BENCH_ENTRIES, "not_yet_ignored")));
+
+    assert!(!get_ignore(find_inner()));
+    assert!(!get_ignore(find_inner_group()));
+    assert!(!get_ignore(find_outer()));
+    assert!(!get_ignore(find_outer_group()));
+}
diff --git a/crates/divan_compat/divan_fork/tests/forbid_unsafe.rs b/crates/divan_compat/divan_fork/tests/forbid_unsafe.rs
new file mode 100644
index 00000000..e7bba2db
--- /dev/null
+++ b/crates/divan_compat/divan_fork/tests/forbid_unsafe.rs
@@ -0,0 +1,85 @@
+// Exhaustively tests that macros work when linting against `unsafe`.
+
+#![forbid(unsafe_code)]
+
+extern crate codspeed_divan_compat_walltime as divan;
+use divan::Bencher;
+
+const CONST_VALUES: [usize; 3] = [1, 5, 10];
+
+#[divan::bench]
+fn freestanding() {}
+
+#[divan::bench(types = [i32, &str])]
+fn freestanding_generic_type<T>() {}
+
+#[divan::bench(consts = [1, 5, 10])]
+fn freestanding_generic_const1<const N: usize>() {}
+
+#[divan::bench(consts = CONST_VALUES)]
+fn freestanding_generic_const2<const N: usize>() {}
+
+#[divan::bench(types = [i32, &str], consts = [1, 5, 10])]
+fn freestanding_generic_type_const1<T, const N: usize>() {}
+
+#[divan::bench(types = [i32, &str], consts = CONST_VALUES)]
+fn freestanding_generic_type_const2<T, const N: usize>() {}
+
+#[divan::bench]
+fn contextual(_: Bencher) {}
+
+#[divan::bench(types = [i32, &str])]
+fn contextual_generic_type<T>(_: Bencher) {}
+
+#[divan::bench(consts = [1, 5, 10])]
+fn contextual_generic_const_1<const N: usize>(_: Bencher) {}
+
+#[divan::bench(consts = CONST_VALUES)]
+fn contextual_generic_const_2<const N: usize>(_: Bencher) {}
+
+#[divan::bench(types = [i32, &str], consts = [1, 5, 10])]
+fn contextual_generic_type_const_1<T, const N: usize>(_: Bencher) {}
+
+#[divan::bench(types = [i32, &str], consts = CONST_VALUES)]
+fn contextual_generic_type_const_2<T, const N: usize>(_: Bencher) {}
+
+#[divan::bench_group]
+mod group {
+    use super::*;
+
+    #[divan::bench]
+    fn freestanding() {}
+
+    #[divan::bench(types = [i32, &str])]
+    fn freestanding_generic_type<T>() {}
+
+    #[divan::bench(consts = [1, 5, 10])]
+    fn freestanding_generic_const1<const N: usize>() {}
+
+    #[divan::bench(consts = CONST_VALUES)]
+    fn freestanding_generic_const2<const N: usize>() {}
+
+    #[divan::bench(types = [i32, &str], consts = [1, 5, 10])]
+    fn freestanding_generic_type_const1<T, const N: usize>() {}
+
+    #[divan::bench(types = [i32, &str], consts = CONST_VALUES)]
+    fn freestanding_generic_type_const2<T, const N: usize>() {}
+
+    #[divan::bench]
+    fn contextual(_: Bencher) {}
+
+    #[divan::bench(types = [i32, &str])]
+    fn contextual_generic_type<T>(_: Bencher) {}
+
+    #[divan::bench(consts = [1, 5, 10])]
+    fn contextual_generic_const1<const N: usize>(_: Bencher) {}
+
+    #[divan::bench(consts = CONST_VALUES)]
+    fn contextual_generic_const2<const N: usize>(_: Bencher) {}
+
+    #[divan::bench(types = [i32, &str], consts = [1, 5, 10])]
+    fn contextual_generic_type_const1<T, const N: usize>(_: Bencher) {}
+
+    #[divan::bench(types = [i32, &str], consts = CONST_VALUES)]
+    fn contextual_generic_type_const2<T, const N: usize>(_: Bencher) {}
+}
diff --git a/crates/divan_compat/divan_fork/tests/weird_usage.rs b/crates/divan_compat/divan_fork/tests/weird_usage.rs
new file mode 100644
index 00000000..28eb4af5
--- /dev/null
+++ b/crates/divan_compat/divan_fork/tests/weird_usage.rs
@@ -0,0 +1,136 @@
+// Tests that ensure weird (but valid) usage behave as expected.
+
+// Miri cannot discover benchmarks.
+#![cfg(not(miri))]
+
+use std::time::Duration;
+
+extern crate codspeed_divan_compat_walltime as divan;
+use divan::{Divan, __private::BENCH_ENTRIES};
+
+#[divan::bench(bytes_count = 0u8, chars_count = 0u16, cycles_count = 0u32, items_count = 0u64)]
+fn zero_throughput() {}
+
+#[divan::bench(min_time = Duration::ZERO)]
+fn min_min() {}
+
+#[divan::bench(max_time = Duration::MAX)]
+fn max_max() {}
+
+#[divan::bench]
+fn lifetime<'a>() -> &'a str {
+    "hello"
+}
+
+#[divan::bench]
+fn embedded() {
+    #[divan::bench]
+    fn inner() {
+        #[divan::bench]
+        fn inner() {}
+    }
+}
+
+#[divan::bench]
+fn r#raw_ident() {}
+
+#[divan::bench(r#name = "raw name ident")]
+fn raw_name_ident() {}
+
+#[divan::bench]
+extern "system" fn extern_abi_1() {}
+
+#[divan::bench]
+#[allow(improper_ctypes_definitions)]
+extern "C" fn extern_abi_2(_: divan::Bencher) {}
+
+#[divan::bench(types = [i32, u8])]
+extern "system" fn extern_abi_3<T>() {}
+
+#[divan::bench(r#types = [i32, u8])]
+#[allow(improper_ctypes_definitions)]
+extern "C" fn extern_abi_4<T>(_: divan::Bencher) {}
+
+#[divan::bench(consts = [0, -1, isize::MAX])]
+extern "system" fn extern_abi_5<const N: isize>() {}
+
+#[divan::bench(consts = [0, -1, isize::MAX])]
+#[allow(improper_ctypes_definitions)]
+extern "C" fn extern_abi_6<const N: isize>(_: divan::Bencher) {}
+
+macro_rules! consts {
+    () => {
+        [0, -1, isize::MAX]
+    };
+}
+
+#[divan::bench(consts = consts!())]
+fn bench_consts<const N: isize>() {}
+
+#[divan::bench(args = [])]
+fn empty_args(_: usize) {}
+
+#[divan::bench(types = [])]
+#[allow(dead_code)]
+fn empty_types<T>() {}
+
+#[divan::bench(consts = [])]
+#[allow(dead_code)]
+fn empty_consts<const C: usize>() {}
+
+#[divan::bench(args = [], consts = [])]
+#[allow(dead_code)]
+fn empty_args_consts<const C: usize>(_: usize) {}
+
+#[divan::bench(types = [], consts = [])]
+#[allow(dead_code)]
+fn empty_types_consts_1<T, const C: usize>() {}
+
+#[divan::bench(consts = [], types = [])]
+#[allow(dead_code)]
+fn empty_types_consts_2<T, const C: usize>() {}
+
+#[divan::bench(types = [], consts = [])]
+#[allow(dead_code)]
+fn empty_types_consts_3<const C: usize, T>() {}
+
+#[divan::bench(consts = [], types = [])]
+#[allow(dead_code)]
+fn empty_types_consts_4<const C: usize, T>() {}
+
+#[test]
+fn test_fn() {
+    Divan::default().test_benches();
+}
+
+// Test that each function appears the expected number of times.
+#[test]
+fn count() {
+    let mut inner_count = 0;
+
+    for entry in BENCH_ENTRIES.iter() {
+        if entry.meta.raw_name == "inner" {
+            inner_count += 1;
+        }
+    }
+    // `embedded` nests two benchmarks that are both named `inner`.
+    assert_eq!(inner_count, 2);
+}
+
+// Test expected `EntryMeta` module path and name values.
+#[test]
+fn path() {
+    for entry in BENCH_ENTRIES.iter() {
+        // Embedded functions do not contain their parent function's name in
+        // their `module_path!()`.
+        if entry.meta.raw_name == "inner" {
+            assert_eq!(entry.meta.module_path, "weird_usage");
+        }
+
+        // "r#" stays in `raw_name` but is removed from `display_name`.
+        if entry.meta.raw_name.contains("raw_ident") {
+            assert_eq!(entry.meta.raw_name, "r#raw_ident");
+            assert_eq!(entry.meta.display_name, "raw_ident");
+        }
+    }
+}