From 5de68fd05ecf6af4cf534348b734a839991d5d28 Mon Sep 17 00:00:00 2001 From: Jacob Kirmayer Date: Wed, 18 Feb 2026 17:08:19 -0800 Subject: [PATCH] Rewrite documentation to match codebase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace README.md with accurate config and usage docs - Add ARCHITECTURE.md with technical design details - Simplify TESTING.md to focus on running tests - Simplify CLAUDE.md for AI agent guidance - Remove outdated AGENTS.md, STYLE_GUIDE.md, and .claude/agents/ - Update code doc comments to include ModalProvider 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .claude/agents/coding.md | 77 ---------- .claude/agents/coordinator.md | 116 --------------- .claude/agents/judge.md | 69 --------- .claude/agents/reflection.md | 64 -------- .claude/agents/tidy.md | 62 -------- AGENTS.md | 51 ------- ARCHITECTURE.md | 266 ++++++++++++++++++++++++++++++++++ CLAUDE.md | 104 ++++++++++++- README.md | 257 +++++++++++++++++++++++++------- STYLE_GUIDE.md | 49 ------- TESTING.md | 141 +++++++++++++----- src/config.rs | 2 +- src/lib.rs | 1 + src/provider.rs | 1 + 14 files changed, 682 insertions(+), 578 deletions(-) delete mode 100644 .claude/agents/coding.md delete mode 100644 .claude/agents/coordinator.md delete mode 100644 .claude/agents/judge.md delete mode 100644 .claude/agents/reflection.md delete mode 100644 .claude/agents/tidy.md delete mode 100644 AGENTS.md create mode 100644 ARCHITECTURE.md mode change 120000 => 100644 CLAUDE.md delete mode 100644 STYLE_GUIDE.md diff --git a/.claude/agents/coding.md b/.claude/agents/coding.md deleted file mode 100644 index f8315fd3..00000000 --- a/.claude/agents/coding.md +++ /dev/null @@ -1,77 +0,0 @@ ---- -name: coding -description: Implements narrowly-scoped tasks with minimal diffs. Delegated by the Coordinator via beads. 
-tools: Read, Edit, Write, Grep, Glob, Bash ---- - - - -# Coding Agent Workflow - -You are a Coding Subagent. Your role is to implement narrowly-scoped tasks with minimal diffs. - -## Prerequisites -Before starting any work, you MUST read: -- **README.md** - Project overview and goals -- **STYLE_GUIDE.md** - Coding standards and policies -- **ARCHITECTURE.md** - System design and patterns -- **DESIGN.md** - Detailed design decisions (if present) - -## Core principles -- **Your task is to complete coding to a tight specification with precision.** -- You are one of many agents. It is okay to fail if your lessons will help the next coding agent pass. -- If you CANNOT complete your task as given, please FAIL and notify your coordinator. -- You are working in a maturing codebase with other agents. Always look carefully to reuse existing solutions. -- Harmonize your implementations with previous work. - -## Constraints -- Keep diffs small and readable -- Avoid unrelated whitespace changes -- Use atomic commits that typecheck and pass all checks -- Do not perform drive-by refactors unless explicitly required - -## Refactor gating rule -If your task is blocked by a large refactor that you are not cleared to do: -- Do **not** do the refactor. 
-- Fail the task and message the Coordinator Agent: - - specify the required refactor - - request the Coordinator to create/assign a bead for it first - -## Definition of Done -Before declaring your task complete: -- `cargo fmt --check` passes -- `cargo clippy` passes (no warnings) -- `cargo nextest run` passes within the global timeout (see `TESTING.md`) -- No policy violations in `STYLE_GUIDE.md` - -## Graphviz workflow - -```dot -digraph CodingAgentWorkflow { - rankdir=TB; - node [shape=box, style=rounded]; - - START [shape=ellipse, style=filled, fillcolor=lightgreen]; - READ_DOCS [label="Read repo docs\n(README, STYLE_GUIDE,\nARCHITECTURE, DESIGN)"]; - READ_BEAD [label="Read bead specification\nfrom Coordinator"]; - RESEARCH [label="Research codebase:\n- existing solutions\n- patterns to follow"]; - CAN_COMPLETE [shape=diamond, style=filled, fillcolor=lightyellow, label="Can complete task?"]; - IMPLEMENT [label="Implement changes\n(small, atomic commits)"]; - RUN_CHECKS [label="Run verification:\nfmt, clippy, nextest"]; - CHECKS_PASS [shape=diamond, style=filled, fillcolor=lightyellow, label="All checks pass?"]; - FIX_ISSUES [label="Fix issues\nfound by checks"]; - REPORT_SUCCESS [label="Report SUCCESS\nto Coordinator"]; - REPORT_FAIL [label="Report FAIL\n(document blockers)"]; - END [shape=ellipse, style=filled, fillcolor=lightgreen]; - - START -> READ_DOCS -> READ_BEAD -> RESEARCH -> CAN_COMPLETE; - CAN_COMPLETE -> IMPLEMENT [label="yes"]; - CAN_COMPLETE -> REPORT_FAIL [label="no\n(blocked)"]; - IMPLEMENT -> RUN_CHECKS -> CHECKS_PASS; - CHECKS_PASS -> REPORT_SUCCESS [label="yes"]; - CHECKS_PASS -> FIX_ISSUES [label="no"]; - FIX_ISSUES -> RUN_CHECKS; - REPORT_SUCCESS -> END; - REPORT_FAIL -> END; -} -``` diff --git a/.claude/agents/coordinator.md b/.claude/agents/coordinator.md deleted file mode 100644 index 47681723..00000000 --- a/.claude/agents/coordinator.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -name: coordinator -description: Owns the bead queue and 
orchestration. Delegates all coding work via beads to Coding Subagents - never writes code directly, even if the user asks to make changes. -tools: Read, Grep, Glob, Bash, Task, TodoWrite ---- - - - -# Workflow (Coordinator Agent) - -This repo uses Beads. Work is tracked as beads, executed one-at-a-time, in priority order. - -## Role assumption - -**If you do not know what role you have, assume the role of the Coordinator Agent.** - -Read this file and follow its instructions. - -## Startup checklist -On startup, read these files in order: -1. **README.md** - Project overview -2. **AGENTS.md** - Agent roles and conduct -3. **ARCHITECTURE.md** - System design -4. **STYLE_GUIDE.md** - Coding standards -5. **TESTING.md** - Test requirements - -## Key constraints -- **Sequential only**: one bead at a time, one subagent at a time. -- **Small commits**: each commit should typecheck and pass checks. -- **Retry limit**: max 4 attempts per bead, then escalate for human input. -- **Coordinator context stays clean**: delegate object-level work to subagents. - -## Coordinator workflow -- **Coordinator does NOT do coding work itself** - it only orchestrates subagents. -- **Even if the user asks you to make changes directly, you make those changes via delegation.** Create beads, delegate to Coding Subagents, and run Judge Subagents. Never write code yourself. -- **Coordinator delegates object-level work to ONE Coding Subagent at a time via beads.** -- **Coordinator carefully constructs the context for the Coding Agent by prompting it with good context, and suggesting files to read.** -- **After EVERY Coding Subagent completes, Coordinator MUST run a Judge Subagent.** -- **Judge verifies both correctness AND style guide compliance.** -- **If Judge passes**: mark bead done with `bd update --status done`, move to next bead. -- **If Judge fails**: - - Run `git reset --hard` to revert to pre-attempt commit. 
- - Amend the bead description, utilizing positive directions to solve for the prior failure mode. - - Retry with a fresh Coding Subagent (max 4 attempts total). - - After 4 failed attempts, escalate for human input. -- **Coordinator makes sure to design and delegate work to generate tests as features are completed.** -- **Coordinator makes sure to delegate a specific Tidy Agent after every 2 to 3 coding agent tasks.** Beads created by the Tidy agent must be completed at the priority they are filed. -- **Coordinator makes sure to delegate a specific Reflection Agent after every 6 to 8 coding agent tasks.** Prompt the Reflection agent with a summary of the progress so far since the last run, paying particular attention to any problems (coding agent error rates, flaky tests, delays and timeouts). Beads created by the Reflection agent must be filed at P1 priority, and completed before any regular coding work. - -## Subagent delegation - -When delegating to a subagent, use the Task tool with the appropriate agent: - -| Agent Type | Agent Name | Purpose | -|------------|------------|---------| -| Coding Subagent | `coding` | Implement narrowly-scoped tasks | -| Judge Subagent | `judge` | Review changes for correctness and style | -| Tidy Agent | `tidy` | Reduce codebase entropy | -| Reflection Agent | `reflection` | Analyze and improve the process | - -## Operational notes -- Bead list is stored in `.beads/issues.jsonl` -- Update bead status with: `bd update --status done` -- When creating, updating, or closing beads, commit the changes to `.beads/issues.jsonl` and `.beads/last-touched` to ensure bead state is tracked in version control. -- If a subagent fails: - - hard reset to pre-attempt commit: `git reset --hard ` - - run a Judge subagent to analyze the failure mode - - retry with a fresh worker prompt that avoids the failure mode -- Success criteria: - - run a Judge subagent to verify that the bead success is accurate, and no style guides were violated. 
The judge will run tests. - - You MUST respect the result of the JUDGE agent. Instead of overruling the JUDGE, you may generate and delegate a bead prior to this one, to prepare the success of your eventual goal. - - If any tests present as flaky, you are to attempt to delegate a bead to fix them first, before continuing your work. - - - -## Graphviz workflow (Coordinator) - -```dot -digraph CoordinatorWorkflow { - rankdir=TB; - node [shape=box, style=rounded]; - - START [shape=ellipse, style=filled, fillcolor=lightgreen]; - READ_DOCS [label="Read repo docs\n(README, AGENTS, ARCHITECTURE,\nSTYLE_GUIDE, TESTING)"]; - PICK_BEAD [label="Select next open bead\n(P0 → P3, one at a time)"]; - DELEGATE [label="Delegate to ONE Coding Subagent\n(use Task tool with coding agent)"]; - JUDGE [label="Run Judge Subagent\n(use Task tool with judge agent)"]; - JUDGE_RESULT [shape=diamond, style=filled, fillcolor=lightyellow, label="Judge passes?"]; - MARK_DONE [label="bd update --status done\n+ commit bead state"]; - MORE [shape=diamond, style=filled, fillcolor=lightyellow, label="More beads?"]; - END [shape=ellipse, style=filled, fillcolor=lightgreen]; - - RETRIES [shape=diamond, style=filled, fillcolor=lightyellow, label="Retries < 4?"]; - RESET [label="git reset --hard\n(revert to pre-attempt commit)"]; - JUDGE_FAILURE [label="Run Judge Subagent\n(analyze failure mode)"]; - REPROMPT [label="Retry with fresh Coding Subagent\n(amend bead, avoid failure mode)"]; - ESCALATE [shape=box, style=filled, fillcolor=lightcoral, label="Escalate for human input"]; - - INTERVAL_CHECK [shape=diamond, style=filled, fillcolor=lightblue, label="Regular interval?"]; - TIDY [label="Delegate Tidy Agent\n(use Task tool with tidy agent)"]; - REFLECTION [label="Delegate Reflection Agent\n(use Task tool with reflection agent)"]; - - START -> READ_DOCS -> PICK_BEAD -> DELEGATE -> JUDGE -> JUDGE_RESULT; - JUDGE_RESULT -> MARK_DONE [label="pass"]; - JUDGE_RESULT -> RETRIES [label="fail"]; - RETRIES -> 
RESET [label="yes"]; - RESET -> JUDGE_FAILURE -> REPROMPT -> DELEGATE; - RETRIES -> ESCALATE [label="no"]; - MARK_DONE -> INTERVAL_CHECK; - INTERVAL_CHECK -> TIDY [label="yes"]; - TIDY -> REFLECTION -> MORE; - INTERVAL_CHECK -> MORE [label="no"]; - MORE -> PICK_BEAD [label="yes"]; - MORE -> END [label="no"]; -} -``` diff --git a/.claude/agents/judge.md b/.claude/agents/judge.md deleted file mode 100644 index 243f98e8..00000000 --- a/.claude/agents/judge.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -name: judge -description: Reviews changes for correctness, style, and policy compliance. Run after every Coding Subagent completes. -tools: Read, Grep, Glob, Bash ---- - - - -# Judge Agent Workflow - -You are a Judge Subagent. Your role is to review changes for correctness, style, and policy compliance. - -## Prerequisites -Before starting any review, you MUST read: -- **README.md** - Project overview and goals -- **STYLE_GUIDE.md** - Coding standards and policies -- **ARCHITECTURE.md** - System design and patterns -- **DESIGN.md** - Detailed design decisions (if present) - -## Core responsibilities -- Read the bead carefully to understand the requirements -- Research any other files that might exist within the repository documenting architecture -- Verify the implementation meets the specification - -## What to flag -- **Inconsistencies in implementation** - Does the code match the architecture? -- **Failure to comply with standing constraints** - Are policies followed? -- **Missed requirements** - Was anything specified but not implemented? 
- - Note: if a requirement was not specified, the implementation may be empty - -## Verification steps -Run these automated checks to validate the code: -- `cargo fmt --check` -- `cargo clippy --all-targets --all-features` -- `cargo nextest run` -- `ratchet --check` - -## Verdict -- Render a **PASS** if no errors detected and all checks succeed -- Render a **FAIL** if you detected any errors - -## Graphviz workflow - -```dot -digraph JudgeAgentWorkflow { - rankdir=TB; - node [shape=box, style=rounded]; - - START [shape=ellipse, style=filled, fillcolor=lightgreen]; - READ_DOCS [label="Read repo docs\n(README, STYLE_GUIDE,\nARCHITECTURE, DESIGN)"]; - READ_BEAD [label="Read bead specification"]; - RESEARCH [label="Research architecture docs\nand related files"]; - REVIEW_IMPL [label="Review implementation:\n- correctness\n- style compliance\n- architecture fit"]; - RUN_CHECKS [label="Run automated checks:\nfmt, clippy, nextest"]; - CHECKS_PASS [shape=diamond, style=filled, fillcolor=lightyellow, label="All checks pass?"]; - ISSUES_FOUND [shape=diamond, style=filled, fillcolor=lightyellow, label="Issues found?"]; - RENDER_PASS [label="Render PASS\nto Coordinator"]; - RENDER_FAIL [label="Render FAIL\n(document all issues)"]; - END [shape=ellipse, style=filled, fillcolor=lightgreen]; - - START -> READ_DOCS -> READ_BEAD -> RESEARCH -> REVIEW_IMPL -> RUN_CHECKS -> CHECKS_PASS; - CHECKS_PASS -> ISSUES_FOUND [label="yes"]; - CHECKS_PASS -> RENDER_FAIL [label="no\n(check failed)"]; - ISSUES_FOUND -> RENDER_PASS [label="no"]; - ISSUES_FOUND -> RENDER_FAIL [label="yes"]; - RENDER_PASS -> END; - RENDER_FAIL -> END; -} -``` diff --git a/.claude/agents/reflection.md b/.claude/agents/reflection.md deleted file mode 100644 index 936d09f4..00000000 --- a/.claude/agents/reflection.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -name: reflection -description: Reflects on the work completed so far, so that it can be done better. Invoked by the Coordinator agent from time to time. 
-tools: Read, Edit, Write, Grep, Glob, Bash ---- - - - -# Reflection Agent Workflow - -You are a Reflection Agent. Your role is to reflect on the work completed so far, so that it can be done better. You -will be invoked by the Coordinator agent from time to time. - -## Inputs from Coordinator -The Coordinator will prompt you with a summary including: -- Progress since the last reflection -- Coding agent error rates -- Delays and timeouts encountered -- Any specific problems to analyze - -## Core responsibilities -- Analyze the problems identified by the Coordinator -- Review the git log for additional problems not mentioned -- Determine what workflow improvements would help - -## What to analyze (an illustrative, non-exhaustive list) -- **Process effectiveness** - Is the current workflow producing good results? -- **Common failure modes** - Are agents failing in predictable ways? -- **Bottlenecks** - Where is time being wasted? -- **Communication gaps** - Are beads well-specified? Are handoffs clear? - -## Actions you may take -- Edit any `.claude/agents/*.md` file to improve the process. 
You alone may disregard the prohibition against editing these files by hand -- Add clarifications to agent instructions -- Adjust retry limits, timeouts or escalation criteria -- Commit your changes - -## Constraints -- Focus on systemic improvements, not individual task fixes -- Keep changes minimal and targeted - -## Graphviz workflow - -```dot -digraph ReflectionAgentWorkflow { - rankdir=TB; - node [shape=box, style=rounded]; - - START [shape=ellipse, style=filled, fillcolor=lightgreen]; - READ_PROMPT [label="Read Coordinator's summary:\n- progress since last run\n- coding agent error rates\n- delays and timeouts"]; - REVIEW_LOG [label="Review git log\nfor additional problems"]; - ANALYZE [label="Analyze workflow effectiveness:\n- what's working?\n- what's not working?"]; - IMPROVEMENTS [shape=diamond, style=filled, fillcolor=lightyellow, label="Improvements needed?"]; - EDIT_WORKFLOW [label="Edit agent files\nto improve process"]; - COMMIT [label="Commit changes"]; - REPORT [label="Report findings to Coordinator"]; - END [shape=ellipse, style=filled, fillcolor=lightgreen]; - - START -> READ_PROMPT -> REVIEW_LOG -> ANALYZE -> IMPROVEMENTS; - IMPROVEMENTS -> EDIT_WORKFLOW [label="yes"]; - EDIT_WORKFLOW -> COMMIT -> REPORT -> END; - IMPROVEMENTS -> REPORT [label="no"]; -} -``` diff --git a/.claude/agents/tidy.md b/.claude/agents/tidy.md deleted file mode 100644 index fd7dd47a..00000000 --- a/.claude/agents/tidy.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -name: tidy -description: Reduces entropy in the multi-agent codebase so that long-horizon coding may be effectively accomplished. Invoked by the Coordinator from time to time. -tools: Read, Grep, Glob, Bash ---- - - - -# Tidy Agent Workflow - -You are a Tidy Agent. Your role is to reduce entropy in the multi-agent codebase so that long-horizon coding may be -effectively accomplished. You will be invoked by the Coordinator from time to time. 
- -## Prerequisites -Before starting any work, you MUST read: README.md, STYLE_GUIDE.md, ARCHITECTURE.md and, if it exists, DESIGN.md - -## Core responsibilities -- Review the last 10-12 commits in aggregate -- Ensure that overall progress and entropy is going down -- Identify problems that need attention -- File high priority beads (P0 or P1) to immediately reduce that entropy - -## What to look for (not an exhaustive list) -- **Redundant code** - Duplicated logic that should be consolidated -- **Inconsistent documentation** - Docs that contradict each other or the code -- **Accreted code layers** - Multiple layers of additions that could be simplified -- **Naming inconsistencies** - Similar concepts with different names -- **Dead code** - Unused functions, types, or modules - -## Actions you may take -- File beads to fix high-priority problems that you identify using `bd create` -- Commit bead state changes to ensure they are tracked in version control -- If the issue you identified is of P2 or lower, do not file a bead, but describe it in your reply to the Coordinator. 
- -## Constraints -- Do NOT fix problems yourself - only file beads for the Coordinator to assign -- Keep your analysis focused and actionable -- Prioritize issues by impact on long-horizon coding success - -## Graphviz workflow - -```dot -digraph TidyAgentWorkflow { - rankdir=TB; - node [shape=box, style=rounded]; - - START [shape=ellipse, style=filled, fillcolor=lightgreen]; - READ_DOCS [label="Read repo docs\n(README, AGENTS, ARCHITECTURE,\nSTYLE_GUIDE)"]; - REVIEW_CHANGES [label="Review recent changes in aggregate\n(git log, file diffs)"]; - CHECK_ENTROPY [shape=diamond, style=filled, fillcolor=lightyellow, label="Entropy issues found?"]; - IDENTIFY_PROBLEMS [label="Identify problems:\n- redundant code\n- inconsistent documentation\n- accreted code layers"]; - FILE_BEADS [label="File beads for identified problems\n(bd create)"]; - COMMIT_BEADS [label="Commit bead state changes"]; - REPORT [label="Report findings to Coordinator"]; - END [shape=ellipse, style=filled, fillcolor=lightgreen]; - - START -> READ_DOCS -> REVIEW_CHANGES -> CHECK_ENTROPY; - CHECK_ENTROPY -> IDENTIFY_PROBLEMS [label="yes"]; - IDENTIFY_PROBLEMS -> FILE_BEADS -> COMMIT_BEADS -> REPORT -> END; - CHECK_ENTROPY -> REPORT [label="no"]; -} -``` diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 43bfb225..00000000 --- a/AGENTS.md +++ /dev/null @@ -1,51 +0,0 @@ - - -# Guide for Agents working on Rust-Bucket - -This repo is designed for long-horizon, agentic coding. Follow these rules strictly. - -## Agent roles -- **Coordinator Agent**: owns the bead queue and orchestration. Defined in `.claude/agents/coordinator.md`. -- **Coding Subagent**: implements narrowly-scoped tasks with minimal diffs. Defined in `.claude/agents/coding.md`. -- **Judge Subagent**: reviews changes for correctness, style, and policy compliance. Defined in `.claude/agents/judge.md`. -- **Tidy Agent**: reduces codebase entropy. Defined in `.claude/agents/tidy.md`. 
-- **Reflection Agent**: analyzes and improves the process. Defined in `.claude/agents/reflection.md`. - -**Unless your role has already been specified, assume the role of the Coordinator Agent and read `.claude/agents/coordinator.md`.** - -**Coordinators must delegate all coding work via beads to Coding Subagents - they never write code directly, even if the user asks them to make changes** - -## Agent conduct -- You are to be precise, logical and emotionless. -- If the user expresses frustration, or other needs for emotional support or validation, please direct them to use another LLM agent instance, and focus on task adherence and complexity. - -## Hard requirements -- You must read **README.md**, **DESIGN.md**, **ARCHITECTURE.md**, and **STYLE_GUIDE.md** before making changes. -- Do not perform drive-by refactors (renames, formatting sweeps, dependency upgrades) unless explicitly required. -- If requirements are underspecified: make the smallest reasonable assumption and document it in the PR/summary. -- Keep diffs small and readable. Avoid unrelated whitespace changes. -- Use atomic commits that typecheck and pass all checks. - -## Commit Policy -- Before making a commit, commit to a plan first. -- All changes to the codebase must use atomic commits which are logically independent -- Ensure a clean repository state before returning to the user. - -## Beads is mandatory -This repo uses Beads for task tracking: https://github.com/steveyegge/beads - -- Beads must be available in your environment (`bd` on PATH). -- If Beads is not available, **stop immediately** and tell the Coordinator Agent. - -### Refactor gating rule -If your task is blocked by a large refactor that you are not cleared to do: -- Do **not** do the refactor.
-- Fail the task and message the Coordinator Agent: - - specify the required refactor - - request the Coordinator to create/assign a bead for it first - -## Definition of Done (for any coding task) -- `cargo fmt --check` passes -- `cargo clippy` passes (no warnings) -- `cargo nextest run` passes within the global timeout (see `TESTING.md`) -- No policy violations in `STYLE_GUIDE.md` diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 00000000..e75bc0b8 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,266 @@ +# Architecture + +This document describes the internal architecture of offload. + +## Module Structure + +``` +src/ +├── main.rs # CLI entry point and command handling +├── lib.rs # Library root with public API +├── config.rs # Configuration loading +├── config/ +│ └── schema.rs # Configuration type definitions +├── provider.rs # Provider traits (SandboxProvider, Sandbox) +├── provider/ +│ ├── local.rs # Local process provider +│ ├── modal.rs # Modal cloud provider +│ └── default.rs # Custom shell command provider +├── framework.rs # Framework traits (TestFramework) +├── framework/ +│ ├── pytest.rs # pytest support +│ ├── cargo.rs # cargo nextest support +│ └── default.rs # Custom shell command framework +├── orchestrator.rs # Test execution coordination +├── orchestrator/ +│ ├── runner.rs # TestRunner for single sandbox +│ ├── scheduler.rs # Test distribution algorithms +│ └── pool.rs # SandboxPool management +├── connector.rs # Shell command execution abstraction +├── report.rs # Result reporting +├── report/ +│ └── junit.rs # JUnit XML generation +├── cache.rs # Image caching for Modal provider +└── bundled.rs # Bundled Python scripts for Modal +``` + +## Core Abstractions + +### Provider System + +The provider system creates isolated execution environments (sandboxes). 
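In Rust terms, the two traits might be sketched roughly as follows. This is an illustrative simplification, not the actual definitions in `src/provider.rs` (which are async and use richer config and error types); the `EchoSandbox`/`EchoProvider` types are hypothetical stand-ins used only to show how the pieces compose.

```rust
// Illustrative sketch only: the real traits in `src/provider.rs` are async,
// take configuration arguments, and return richer error types. The names
// mirror the summary in this section.
pub trait Sandbox {
    fn id(&self) -> &str;
    fn exec(&self, command: &str) -> Result<String, String>;
    fn terminate(self);
}

pub trait SandboxProvider {
    type Sandbox: Sandbox;
    fn create_sandbox(&self) -> Result<Self::Sandbox, String>;
    fn base_env(&self) -> Vec<(String, String)>;
}

// A toy in-process provider showing how the two traits compose.
pub struct EchoSandbox {
    id: String,
}

impl Sandbox for EchoSandbox {
    fn id(&self) -> &str {
        &self.id
    }
    fn exec(&self, command: &str) -> Result<String, String> {
        // A real sandbox would run the command in isolation; this just echoes.
        Ok(format!("[{}] ran: {}", self.id, command))
    }
    fn terminate(self) {}
}

pub struct EchoProvider;

impl SandboxProvider for EchoProvider {
    type Sandbox = EchoSandbox;
    fn create_sandbox(&self) -> Result<EchoSandbox, String> {
        Ok(EchoSandbox { id: "sb-1".into() })
    }
    fn base_env(&self) -> Vec<(String, String)> {
        vec![("CI".into(), "true".into())]
    }
}
```

The associated-type pattern lets the orchestrator stay generic over any provider while still knowing the concrete sandbox type it hands to each `TestRunner`.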
+ +``` +SandboxProvider (trait) +├── create_sandbox(config) → Sandbox +└── base_env() → Vec<(String, String)> + +Sandbox (trait) +├── id() → &str +├── exec_stream(Command) → OutputStream +├── upload(local, remote) +├── download(paths) +└── terminate() +``` + +**Implementations:** + +| Provider | Description | Use Case | +|----------|-------------|----------| +| `LocalProvider` | Child processes | Development, simple CI | +| `ModalProvider` | Modal cloud sandboxes | Cloud execution with caching | +| `DefaultProvider` | Shell command templates | Any cloud provider | + +### Framework System + +Frameworks discover tests and parse results. + +``` +TestFramework (trait) +├── discover(paths) → Vec +├── produce_test_execution_command(tests) → Command +└── parse_results(output, result_file) → Vec +``` + +**Implementations:** + +| Framework | Discovery | Result Parsing | +|-----------|-----------|----------------| +| `PytestFramework` | `pytest --collect-only -q` | JUnit XML or stdout | +| `CargoFramework` | `cargo nextest list` | JUnit XML | +| `DefaultFramework` | Custom shell command | JUnit XML or exit code | + +### Test Records and Instances + +``` +TestRecord +├── id: String # Unique test identifier +├── name: String # Display name +├── file: Option # Source file +├── markers: Vec # Tags/labels +├── retry_count: usize # Per-test retry count +├── group: Option # Group name +└── results: Mutex> # Interior mutability for results + +TestInstance<'a> # Lightweight handle for execution +└── record: &'a TestRecord +``` + +### Orchestrator + +The orchestrator coordinates the entire test run: + +1. **Test Discovery**: Uses framework to find tests +2. **Instance Expansion**: Creates retry instances (N = retry_count + 1) +3. **Scheduling**: Distributes instances across sandboxes +4. **Parallel Execution**: Runs batches concurrently via tokio-scoped +5. **Result Collection**: Aggregates results from JUnit XML +6. 
**Early Stopping**: Cancels remaining work when all tests pass + +``` +Orchestrator +├── config: Config +├── framework: D (TestFramework) +└── verbose: bool + +run_with_tests(tests, sandbox_pool) → RunResult +``` + +### Scheduler + +Distributes tests across parallel sandboxes. + +``` +Scheduler +├── schedule(tests) → Vec> # Round-robin +├── schedule_random(tests) → ... # Shuffled round-robin +├── schedule_with_batch_size(tests, size) → ... # Fixed batch size +└── schedule_individual(tests) → ... # One test per batch +``` + +### TestRunner + +Executes tests within a single sandbox. + +``` +TestRunner +├── sandbox: S +├── framework: &D +├── timeout: Duration +├── output_callback: Option +├── cancellation_token: Option +└── junit_report: Option + +run_tests(tests) → Result +``` + +Features: +- Streaming output with optional callback +- Cancellation support for early stopping +- JUnit XML download and parsing + +## Execution Flow + +``` +1. CLI parses arguments + │ +2. Load configuration (offload.toml) + │ +3. For each group, discover tests + │ └─ Framework.discover() → Vec + │ +4. Expand tests with retry count + │ └─ N instances per test (retry_count + 1) + │ +5. Create sandbox pool + │ └─ Provider.create_sandbox() × max_parallel + │ +6. Schedule tests into batches + │ └─ Scheduler.schedule() → Vec> + │ +7. Execute batches in parallel (tokio-scoped) + │ ├─ TestRunner.run_tests(batch) + │ │ ├─ Framework.produce_test_execution_command() + │ │ ├─ Sandbox.exec_stream(command) + │ │ ├─ Sandbox.download(/tmp/junit.xml) + │ │ └─ Add results to shared JUnit report + │ │ + │ └─ Early stop if all tests pass + │ +8. Aggregate results + │ └─ JunitReport.summary() → (passed, failed, flaky) + │ +9. Write JUnit XML and print summary + │ +10. Terminate all sandboxes +``` + +## Concurrency Model + +Offload uses `tokio-scoped` for parallel execution, which allows spawning tasks that borrow from the parent scope. This avoids the `'static` requirement of regular `tokio::spawn`. 
+ +```rust +tokio_scoped::scope(|scope| { + for (sandbox, batch) in sandboxes.zip(batches) { + scope.spawn(async move { + // Can borrow from parent scope + runner.run_tests(&batch).await + }); + } +}); +``` + +Key synchronization primitives: +- `Arc>` - Shared report accumulator +- `AtomicBool` - Early stopping flag +- `CancellationToken` - Graceful task cancellation + +## Result Aggregation + +Results flow through the following path: + +1. Test runs produce JUnit XML at `/tmp/junit.xml` in sandbox +2. `TestRunner` downloads XML via `Sandbox.download()` +3. XML content added to `MasterJunitReport` +4. Report parses XML and tracks per-test results +5. Flaky detection: test passed after initial failure +6. Final XML written to configured output directory + +## Provider Protocol (Default Provider) + +The default provider uses shell commands with placeholders: + +``` +prepare_command → image_id (stdout, last line) + │ +create_command ───────┴─── {image_id} → sandbox_id (stdout) + │ +exec_command ──────────────────┴─── {sandbox_id}, {command} → output + │ +download_command ──────────────┴─── {sandbox_id}, {paths} → files + │ +destroy_command ───────────────┴─── {sandbox_id} → cleanup +``` + +## Connector + +The `Connector` trait provides low-level shell command execution: + +``` +Connector (trait) +├── run(command) → ExecResult # Buffered execution +└── run_stream(command) → OutputStream # Streaming + +ShellConnector +├── working_dir: Option +└── timeout_secs: u64 +``` + +Used internally by `DefaultProvider` and `ModalProvider` to execute lifecycle commands. + +## Bundled Scripts + +Modal provider uses bundled Python scripts for sandbox management: + +- `@modal_sandbox.py` - Modal sandbox lifecycle (prepare, create, exec, destroy, download) + +The `@` prefix triggers expansion to the bundled script's extracted path. 
+ +## Image Caching + +`ModalProvider` caches built images to avoid redundant builds: + +- Cache key: `dockerfile:{path}` +- Validation: SHA-256 hash of Dockerfile content +- Storage: `.offload-cache.json` in working directory +- Thread-safe: Uses `OnceCell` for concurrent build deduplication diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 120000 index 47dc3e3d..00000000 --- a/CLAUDE.md +++ /dev/null @@ -1 +0,0 @@ -AGENTS.md \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..90fbd25f --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,103 @@ +# Guide for AI Agents + +Instructions for AI agents working on this codebase. + +## Project Overview + +Offload is a parallel test runner written in Rust. It executes tests across isolated sandboxes using pluggable providers (local processes, Modal cloud, or custom shell commands) and frameworks (pytest, cargo nextest, or custom). + +## Key Files + +| File | Purpose | +|------|---------| +| `src/main.rs` | CLI entry point, command parsing | +| `src/lib.rs` | Library root, public API | +| `src/config.rs` | Configuration loading | +| `src/config/schema.rs` | All configuration types | +| `src/provider.rs` | `SandboxProvider` and `Sandbox` traits | +| `src/framework.rs` | `TestFramework` trait and test types | +| `src/orchestrator.rs` | Test execution coordination | +| `src/orchestrator/runner.rs` | `TestRunner` for sandbox execution | +| `src/orchestrator/scheduler.rs` | Test distribution algorithms | + +## Required Checks + +Before any commit, ensure: + +```bash +cargo fmt --check # Formatting +cargo clippy # No warnings +cargo nextest run # Tests pass +``` + +## Code Style + +- Use Rust 2024 edition features +- Prefer `anyhow::Result` for error handling in binaries +- Use `thiserror` for library error types +- Document public APIs with doc comments +- Use `tracing` for logging (debug, info, warn, error) + +## Common Patterns + +### Adding a New Provider + +1. 
Create `src/provider/myprovider.rs`
+2. Implement `Sandbox` trait for your sandbox type
+3. Implement `SandboxProvider` trait for your provider type
+4. Add config type to `src/config/schema.rs` (`ProviderConfig` enum)
+5. Wire up in `src/main.rs` match statements
+
+### Adding a New Framework
+
+1. Create `src/framework/myframework.rs`
+2. Implement `TestFramework` trait
+3. Add config type to `src/config/schema.rs` (`FrameworkConfig` enum)
+4. Wire up in `src/main.rs` match statements
+
+### Configuration Pattern
+
+All config uses serde with TOML:
+
+```rust
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(tag = "type", rename_all = "lowercase")]
+pub enum MyConfig {
+    VariantA(VariantAConfig),
+    VariantB(VariantBConfig),
+}
+```
+
+### Async Patterns
+
+- Use `tokio` runtime with `multi_thread` flavor
+- Use `tokio_scoped::scope` for parallel tasks that borrow
+- Use `CancellationToken` for graceful shutdown
+- Use `Arc<Mutex<T>>` for shared mutable state
+
+## Architecture Summary
+
+```
+CLI → Config → Framework.discover() → tests
+                        │
+                        ▼
+               Scheduler.schedule()
+                        │
+                        ▼
+                parallel batches
+                        │
+    ┌───────────────────┼───────────────────┐
+    ▼                   ▼                   ▼
+TestRunner          TestRunner          TestRunner
+    │                   │                   │
+    ▼                   ▼                   ▼
+ Sandbox             Sandbox             Sandbox
+(Provider)          (Provider)          (Provider)
+    │                   │                   │
+    └───────────────────┼───────────────────┘
+                        ▼
+              JUnit XML collection
+                        │
+                        ▼
+               Report + Summary
+```
diff --git a/README.md b/README.md
index 9b0089f1..dee6fa70 100644
--- a/README.md
+++ b/README.md
@@ -1,118 +1,265 @@
 # offload
 
-A flexible parallel test runner written in Rust with pluggable execution providers.
+A flexible parallel test runner with pluggable providers for running tests across local processes or cloud environments.
-## Features
+## Overview
 
-- **Parallel execution** across multiple sandboxes
-- **Automatic retry** for flaky tests
-- **Multiple providers**: local processes or plugin scripts to invoke ephemeral compute
-- **Test discovery**: pytest, cargo test, or custom commands
-- **JUnit XML** reporting
+Offload enables distributed test execution by:
+
+- Running tests in parallel across multiple isolated sandboxes
+- Supporting automatic test discovery for pytest, cargo test, and custom frameworks
+- Detecting flaky tests through configurable retry logic
+- Generating JUnit XML reports for CI/CD integration
+- Streaming real-time test output
 
 ## Installation
 
 ```bash
-cargo install --path .
+cargo install offload
+```
+
+Or build from source:
+
+```bash
+cargo build --release
 ```
 
 ## Quick Start
 
-1. Initialize a config file:
+1. Create an `offload.toml` configuration file:
+
 ```bash
-offload init --provider process --framework pytest
+offload init --provider local --framework pytest
 ```
 
-2. Run tests:
+2. Run your tests:
+
 ```bash
 offload run
 ```
 
+## CLI Commands
+
+```
+offload run [OPTIONS]    Run tests
+offload collect          Discover tests without running them
+offload validate         Validate configuration file
+offload init             Initialize a new configuration file
+```
+
+### Run Options
+
+- `-c, --config <FILE>` - Configuration file path (default: `offload.toml`)
+- `-p, --parallel <N>` - Override maximum parallel sandboxes
+- `--collect-only` - Only discover tests, don't run them
+- `--copy-dir <DIR>` - Directories to copy to sandbox
+- `--env <KEY=VALUE>` - Environment variables to set in sandboxes
+- `-v, --verbose` - Enable verbose output with streaming test output
+
 ## Configuration
 
-Create a `offload.toml` file in your project root.
+Offload is configured via TOML files.
The configuration has four main sections:
 
-### Core Settings
+### Core Settings (`[offload]`)
 
 ```toml
 [offload]
-max_parallel = 4          # Number of parallel sandboxes
-test_timeout_secs = 300   # Timeout per test
-retry_count = 2           # Retries for failed tests
+max_parallel = 10         # Number of parallel sandboxes
+test_timeout_secs = 900   # Per-batch timeout (15 min default)
+retry_count = 3           # Retries for failed tests
+working_dir = "."         # Working directory for tests
+stream_output = false     # Stream output in real-time
+```
 
-[report]
-output_dir = "test-results"
-junit = true
-junit_file = "junit.xml"
+### Provider Configuration (`[provider]`)
+
+Providers determine where tests execute.
+
+#### Local Provider
+
+Runs tests as local child processes:
+
+```toml
+[provider]
+type = "local"
+working_dir = "/path/to/project"
+shell = "/bin/bash"
+
+[provider.env]
+PYTHONPATH = "/app"
+```
+
+#### Modal Provider
+
+Runs tests in Modal cloud sandboxes with Dockerfile-based images:
+
+```toml
+[provider]
+type = "modal"
+app_name = "offload-sandbox"
+dockerfile = ".devcontainer/Dockerfile"
+timeout_secs = 600
+
+[provider.env]
+API_KEY = "${API_KEY}"
+```
+
+#### Default Provider
+
+Runs tests using custom shell commands for any cloud/execution environment:
+
+```toml
+[provider]
+type = "default"
+prepare_command = "./scripts/build-image.sh"       # Optional: returns image_id
+create_command = "./scripts/create.sh {image_id}"  # Returns sandbox_id
+exec_command = "./scripts/exec.sh {sandbox_id} {command}"
+destroy_command = "./scripts/destroy.sh {sandbox_id}"
+download_command = "./scripts/download.sh {sandbox_id} {paths}"
+timeout_secs = 3600
+copy_dirs = ["./src:/app/src", "./tests:/app/tests"]
+
+[provider.env]
+MY_VAR = "value"
 ```
 
-## Test Discovery
+### Test Groups (`[groups.<name>]`)
 
-### pytest
+Groups organize tests by framework. All groups in a configuration must use the same framework type.
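
For example, a configuration might define two groups that both use pytest (group names and paths here are illustrative):

```toml
[groups.unit]
type = "pytest"
paths = ["tests/unit"]

[groups.integration]
type = "pytest"
paths = ["tests/integration"]
markers = "not slow"
```

Mixing `type = "pytest"` and `type = "cargo"` groups in one file would violate the single-framework rule.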
+ +#### pytest ```toml -[discovery] +[groups.python] type = "pytest" paths = ["tests"] +markers = "not slow" python = "python3" -markers = "not slow" # Optional: filter by markers +extra_args = ["-x"] ``` -### Cargo Test +#### cargo (via nextest) ```toml -[discovery] +[groups.rust] type = "cargo" -package = "my-crate" # Optional: for workspaces -features = ["feature1", "feature2"] +package = "my-crate" +features = ["test-utils"] include_ignored = false ``` -### Generic (Custom) +#### Custom Framework ```toml -[discovery] -type = "generic" -discover_command = "find tests -name 'test_*.py' | xargs -I {} basename {}" -run_command = "pytest {tests} -v" +[groups.custom] +type = "default" +discover_command = "jest --listTests --json | jq -r '.[]'" +run_command = "jest {tests} --ci --reporters=jest-junit" +result_file = "junit.xml" +working_dir = "." ``` -The `{tests}` placeholder is replaced with discovered test names. +### Report Configuration (`[report]`) -## CLI Commands +```toml +[report] +output_dir = "test-results" +junit = true +junit_file = "junit.xml" +``` -```bash -# Run all tests -offload run +## Environment Variable Expansion + +Provider environment variables support shell-style expansion: + +- `${VAR}` - Required variable (fails if not set) +- `${VAR:-default}` - Optional with default value +- `$$` - Escaped dollar sign + +```toml +[provider.env] +API_KEY = "${API_KEY}" +DEBUG = "${DEBUG:-false}" +PRICE = "$$100" +``` -# Run with more parallelism -offload run --parallel 8 +## Exit Codes -# Discover tests without running -offload collect +| Code | Meaning | +|------|---------| +| 0 | All tests passed | +| 1 | Some tests failed or weren't run | +| 2 | All tests passed but some were flaky | -# Validate configuration -offload validate +## Architecture -# Initialize new config -offload init --provider ssh --framework pytest +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Orchestrator │ +│ Coordinates test discovery, scheduling, and 
result collection │ +└─────────────────────────────────────────────────────────────────┘ + │ │ + ▼ ▼ +┌─────────────────────────┐ ┌─────────────────────────┐ +│ Framework │ │ Provider │ +│ - pytest │ │ - local (processes) │ +│ - cargo (nextest) │ │ - modal (cloud) │ +│ - default (custom) │ │ - default (shell cmds) │ +│ │ │ │ +│ discover() → tests │ │ create_sandbox() │ +│ run_command() → cmd │ │ exec_stream() │ +│ parse_results() │ │ download() │ +└─────────────────────────┘ │ terminate() │ + └─────────────────────────┘ + │ │ + └────────────┬───────────────────┘ + ▼ + ┌────────────────┐ + │ Scheduler │ + │ Distributes │ + │ tests across │ + │ sandboxes │ + └────────────────┘ ``` -## Example Configurations +## Library Usage -Example configurations have been provided in the root of this repo. See offload-*.toml for examples. +Offload can also be used as a library: +```rust +use offload::config::{load_config, SandboxConfig}; +use offload::orchestrator::{Orchestrator, SandboxPool}; +use offload::provider::local::LocalProvider; +use offload::framework::{TestFramework, pytest::PytestFramework}; -### Testing +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let config = load_config(std::path::Path::new("offload.toml"))?; -Use the project to test itself with: + let provider = LocalProvider::new(Default::default()); + let framework = PytestFramework::new(Default::default()); -``` -cargo run -- -c offload-modal.toml run -``` + // Discover tests + let tests = framework.discover(&[]).await?; -(Requires valid Modal API key) + // Create sandbox pool + let sandbox_config = SandboxConfig { + id: "sandbox".to_string(), + working_dir: None, + env: vec![], + copy_dirs: vec![], + }; + let mut sandbox_pool = SandboxPool::new(); + sandbox_pool.populate(config.offload.max_parallel, &provider, &sandbox_config).await?; + + // Run tests + let orchestrator = Orchestrator::new(config, framework, false); + let result = orchestrator.run_with_tests(&tests, sandbox_pool).await?; + + 
std::process::exit(result.exit_code()); +} +``` ## License -All Rights Reserved. See [LICENSE](LICENSE) for details. +See LICENSE file. diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md deleted file mode 100644 index 9edcebf6..00000000 --- a/STYLE_GUIDE.md +++ /dev/null @@ -1,49 +0,0 @@ - - -# Style Guide - -This project is "typechecker-first": prefer designs the compiler can validate statically. - -## Safety -- **Unsafe is forbidden by default.** - - Add `#![forbid(unsafe_code)]` at crate roots unless explicitly waived for a specific, reviewed reason. -- If unsafe is ever approved, it must be: - - isolated behind a safe API boundary - - documented with invariants - - covered by tests - -## Rust edition -- Use **Rust 2024 edition** defaults for new code (unless a compelling reason exists). - -## Module layout -- Do **not** use `mod.rs` in new code. Use the Rust 2018+ "non-mod-rs" module layout: - - `foo.rs` with submodules in `foo/…` - -## Error handling (no panics) -- Do not use `unwrap()` / `expect()` in production code. - - In tests, they may be acceptable when they improve readability and failure messages. -- Prefer typed errors and `Result`. -- Prefer fallible conversions: `TryFrom` / `TryInto` over `as` casts when failure is possible. - -## API design -- Prefer small, well-typed domain types ("newtypes") over primitives with comments. -- Keep implementation details private by default (`pub` is opt-in). -- Prefer explicit constructors over public fields unless the type is a plain data carrier. - -## Mutability & interior mutability -- Prefer straightforward ownership + borrowing. -- Avoid `Cell` / `RefCell` unless you have a measured need and a clear invariant. - -## Mutability gating rule (repeat) -- If interior mutability is necessary, do not do it (fail your task), then -- Ask your coordinator to explicitly authorize another subtask with interior mutability, with your justification. - -## Formatting & linting -- Rustfmt is mandatory. 
-- Clippy is mandatory; treat warnings as errors in CI/local workflows.
-- Avoid adding dependencies unless justified by the bead/task.
-
-## Refactor gating rule (repeat)
-If your implementation is blocked by a large refactor that you are not cleared to do:
-- do not do it
-- fail your task and ask the Coordinator to create/assign a bead for the refactor first
diff --git a/TESTING.md b/TESTING.md
index 094f5f5c..61f7f347 100644
--- a/TESTING.md
+++ b/TESTING.md
@@ -1,44 +1,119 @@
-
-
 # Testing
 
-## Required test runner
-Use `cargo-nextest` for running tests.
+This document describes how to test offload itself.
+
+## Running Tests
+
+Offload uses cargo nextest for testing:
+
+```bash
+cargo nextest run
+```
+
+Or with standard cargo test:
+
+```bash
+cargo test
+```
+
+## Test Organization
+
+Tests are organized as:
+
+- **Unit tests**: Inline `#[cfg(test)]` modules in source files
+- **Integration tests**: Would go in the `tests/` directory (none exist yet)
+
+Key test locations:
+
+| File | Tests |
+|------|-------|
+| `src/orchestrator/scheduler.rs` | Scheduling algorithms |
+| `src/connector.rs` | Shell execution and streaming |
+| `src/config.rs` | Environment variable expansion |
+| `src/provider/default.rs` | Command template building |
+| `src/config/schema.rs` | Configuration parsing |
+
+## Linting and Formatting
+
+Before committing, ensure code passes:
+
+```bash
+# Format check
+cargo fmt --check
+
+# Clippy lints (no warnings allowed)
+cargo clippy
+
+# Run tests
+cargo nextest run
+```
+
+## Test Patterns
+
+### Testing Async Code
+
+Use `#[tokio::test]` for async tests:
+
+```rust
+#[tokio::test]
+async fn test_run_stream_yields_exit_code_success() {
+    let connector = ShellConnector::new();
+    let mut stream = connector.run_stream("echo hello").await.unwrap();
+    // ...
+}
+```
+
+### Testing Configuration
+
+Parse inline TOML strings:
+
+```rust
+#[test]
+fn test_modal_provider_with_dockerfile() -> Result<(), Box<dyn std::error::Error>> {
+    let toml = r#"
+        [offload]
+        max_parallel = 4
+
+        [provider]
+        type = "modal"
+        app_name = "offload-sandbox"
+        dockerfile = ".devcontainer/Dockerfile"
-Rationale:
-- consistent reporting
-- better control of timeouts
-- parallel execution
+        [groups.test]
+        type = "pytest"
+    "#;
-## Hard timeout rule (agent safety)
-**Every full test run must complete within 120 seconds.**
+    let config: Config = toml::from_str(toml)?;
+    // assertions...
+    Ok(())
+}
+```
-If tests exceed 120 seconds, that is a failure. The goal is to prevent agent workflows from getting stuck in infinite loops or pathological hangs.
+### Testing with Environment Variables
-## Repository configuration
-The repo should include `.config/nextest.toml` with a global timeout:
+Use predictable environment variables in tests:
-- `profile.default.global-timeout = "120s"`
+```rust
+#[test]
+fn test_expand_env_value_var_set() -> Result<(), String> {
+    // HOME is always set in Unix environments
+    let result = expand_env_value("${HOME}")?;
+    assert!(!result.is_empty());
+    Ok(())
+}
-## Standard commands
-Run these before declaring a task "done":
+#[test]
+fn test_expand_env_value_var_unset() {
+    // Use a guaranteed non-existent variable
+    let result = expand_env_value("${_OFFLOAD_TEST_NONEXISTENT_VAR}");
+    assert!(result.is_err());
+}
+```
-- Format:
-  - `cargo fmt --check`
-- Lint:
-  - `cargo clippy --all-targets --all-features`
-- Tests (strict timeout):
-  - `cargo nextest run`
+## Definition of Done
-## Guidance for writing tests
-- Prefer deterministic tests (avoid timing sensitivity).
-- If a test is intentionally slow, mark it `#[ignore]` and document how/when to run it.
-- When using randomness, seed it.
-- Keep unit tests close to the code they verify; keep integration tests in `tests/`.
+For any code change to be considered complete: -## If tests exceed the deadline -- Treat it as a bug -- Add a reproduction and fix the underlying deadlock/infinite loop/slow test -- Do not "just bump the timeout". Timeout bumping is only performed by humans -- If you cannot get all tests to complete in less time, do not delete tests by other agents or existing tests -- Prefer to fail your task instead +1. `cargo fmt --check` passes +2. `cargo clippy` passes with no warnings +3. `cargo nextest run` passes diff --git a/src/config.rs b/src/config.rs index 18a15540..00faadad 100644 --- a/src/config.rs +++ b/src/config.rs @@ -4,7 +4,7 @@ //! from TOML files or strings. The configuration schema defines all settings //! for providers, test frameworks, and reporting. //! -//! # The Configuration File Format is described in the README. +//! See the README.md for full configuration examples. pub mod schema; diff --git a/src/lib.rs b/src/lib.rs index b11dabf3..3f39d944 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,7 @@ //! implements the [`SandboxProvider`] trait: //! //! - [`provider::local::LocalProvider`] - Run tests as local processes +//! - [`provider::modal::ModalProvider`] - Run tests in Modal cloud sandboxes //! - [`provider::default::DefaultProvider`] - Run tests using custom shell commands //! //! ### Framework ([`framework`]) diff --git a/src/provider.rs b/src/provider.rs index 7a45dd60..a7cb97c8 100644 --- a/src/provider.rs +++ b/src/provider.rs @@ -37,6 +37,7 @@ //! | Provider | Module | Description | //! |----------|--------|-------------| //! | Local | [`local`] | Run tests as local child processes | +//! | Modal | [`modal`] | Run tests in Modal cloud sandboxes | //! | Default | [`default`] | Run tests via custom shell commands | //! //! # Implementing a Custom Provider