openai
diff --git a/‎AGENTS.md‎
Lines changed: 5 additions & 0 deletions b/‎AGENTS.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎examples/evals/realtime_evals/.gitignore‎
Lines changed: 29 additions & 0 deletions b/‎examples/evals/realtime_evals/.gitignore‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎examples/evals/realtime_evals/Makefile‎
Lines changed: 40 additions & 0 deletions b/‎examples/evals/realtime_evals/Makefile‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎examples/evals/realtime_evals/README.md‎
Lines changed: 34 additions & 11 deletions b/‎examples/evals/realtime_evals/README.md‎
Lines changed: 34 additions & 11 deletions
diff --git a/‎examples/evals/realtime_evals/crawl_harness/README.md‎
Lines changed: 2 additions & 0 deletions b/‎examples/evals/realtime_evals/crawl_harness/README.md‎
Lines changed: 2 additions & 0 deletions
@@ -37,3 +37,8 @@ These are considered priority 0 issues for this repo, in addition to the normal
 - Check that code identifiers remain descriptive (no leftover placeholder names) and that repeated values are factored into constants when practical.
 - Ensure notebooks or scripts document any required environment variables instead of hard-coding secrets or keys.
 - Confirm metadata files (`registry.yaml`, `authors.yaml`) stay in sync with new or relocated content.
+
+## Recent Learnings
+
+- **Realtime eval shared imports can resolve the wrong module under pytest** -> Add `shared/__init__.py` and ensure tests prepend `examples/evals/realtime_evals` to `sys.path` before importing `shared.*` -> Prevents collection failures caused by unrelated installed packages named `shared`.
+- **Run-level grades can be overweighted by long simulations** -> Store turn-level grades on the matching turn and trace-level grades on one row per simulation instead of copying them onto every row -> Keeps `results.csv` row semantics intact and prevents summary means from favoring longer conversations.
@@ -0,0 +1,29 @@
+# Python bytecode and caches
+__pycache__/
+*.py[cod]
+*.pyo
+
+# Virtual environments
+.venv/
+venv/
+env/
+ENV/
+
+# Python tooling caches
+.mypy_cache/
+.pytest_cache/
+.ruff_cache/
+.coverage
+htmlcov/
+
+# Jupyter
+.ipynb_checkpoints/
+
+# Local Python version management
+.python-version
+
+# uv local cache
+.uv-cache/
+
+# Harness outputs
+*_harness/results/
@@ -0,0 +1,40 @@
+# Tool discovery. Each value is whatever `command -v` prints, or empty when
+# the tool is not on PATH.
+UV_BIN := $(strip $(shell command -v uv 2>/dev/null))
+# `python` is tried first; `python3` is only consulted when that lookup fails.
+PYTHON_BIN := $(strip $(shell command -v python 2>/dev/null || command -v python3 2>/dev/null))
+
+# Location of the local virtual environment and the interpreter inside it.
+VENV_DIR := .venv
+VENV_PYTHON := $(VENV_DIR)/bin/python
+
+# Commands used by the format/lint/typecheck/test targets.
+ifneq ($(UV_BIN),)
+# uv is available: run every tool through `uv run`, pulling the tool (and the
+# stub packages listed for mypy) in via --with.
+RUFF := uv run --with ruff ruff
+MYPY := uv run --with mypy --with pandas-stubs --with types-seaborn --with types-tqdm mypy
+PYTEST := uv run --with pytest pytest
+else
+# uv is not available: use the executables under the virtual environment's bin directory.
+RUFF := $(VENV_DIR)/bin/ruff
+MYPY := $(VENV_DIR)/bin/mypy
+PYTEST := $(VENV_DIR)/bin/pytest
+endif
+
+# None of these targets names a file; always run their recipes.
+.PHONY: install format lint lint-fix typecheck test
+
+# Create the virtual environment in VENV_DIR and install dependencies.
+#   - uv on PATH:  `uv venv` followed by `uv sync --group dev`.
+#   - otherwise:   the interpreter's venv module, then pip with
+#                  requirements.txt and requirements-dev.txt.
+# The branch is chosen when the Makefile is parsed, based on UV_BIN.
+install:
+ifeq ($(UV_BIN),)
+ifeq ($(PYTHON_BIN),)
+	$(error Neither uv nor python/python3 was found on PATH. Install one of them and rerun make install)
+endif
+	$(PYTHON_BIN) -m venv $(VENV_DIR)
+	$(VENV_PYTHON) -m pip install -r requirements.txt -r requirements-dev.txt
+else
+	uv venv $(VENV_DIR)
+	uv sync --group dev
+endif
+
+# Apply ruff's formatter to the current directory tree.
+format:
+	$(RUFF) format .
+
+# Run ruff's lint checks over the current directory tree.
+lint:
+	$(RUFF) check .
+
+# Same check as `lint`, with ruff's --fix flag enabled.
+lint-fix:
+	$(RUFF) check . --fix
+
+# Run mypy over the current directory with --explicit-package-bases.
+typecheck:
+	$(MYPY) --explicit-package-bases .
+
+# Run pytest with no extra arguments.
+test:
+	$(PYTEST)
@@ -11,20 +11,33 @@ Depending on your realtime eval maturity, point Codex (or your preferred coding
 
 ## Quickstart
 
-Python 3.9+ required.
+Python 3.12+ required.
 
 ```bash
-pip install -r requirements.txt
+make install
+source .venv/bin/activate
 export OPENAI_API_KEY="your_api_key"
 ```
 
-Run a first command per harness:
+`make install` creates the local `.venv` and installs both runtime and dev dependencies. It uses `uv` when available and otherwise falls back to `python -m venv` plus `pip install -r requirements.txt -r requirements-dev.txt`.
 
-- Crawl: `python crawl_harness/run_realtime_evals.py`
+Run a first command per harness. If uv is not installed, activate your `.venv` and drop the `uv run` prefix from these commands (for example, `python crawl_harness/run_realtime_evals.py`):
+
+- Crawl: `uv run python crawl_harness/run_realtime_evals.py`
 - Walk: install ffmpeg (`brew install ffmpeg`), then:
-  - `python walk_harness/generate_audio.py`
-  - `python walk_harness/run_realtime_evals.py`
-- Run: `python run_harness/run_realtime_evals.py --max-examples 1`
+  - `uv run python walk_harness/generate_audio.py`
+  - `uv run python walk_harness/run_realtime_evals.py`
+- Run: `uv run python run_harness/run_realtime_evals.py --max-examples 1`
+
+## Dev commands
+Use the `Makefile` in this directory (`examples/evals/realtime_evals`) for common checks. Run `make install` first to create `.venv`. These targets work with or without `uv`: when `uv` is installed they run through `uv run`, and otherwise they use the matching tool binaries from the local `.venv`.
+
+- `make install`
+- `make format`
+- `make lint`
+- `make lint-fix`
+- `make typecheck`
+- `make test`
 
 ## [Crawl (synthetic single-turn)](./crawl_harness)
 
@@ -38,7 +51,7 @@ Run a first command per harness:
 Run:
 
 ```
-python crawl_harness/run_realtime_evals.py
+uv run python crawl_harness/run_realtime_evals.py
 ```
 
 ## [Walk (saved audio replay)](./walk_harness)
@@ -53,8 +66,8 @@ python crawl_harness/run_realtime_evals.py
 Run:
 
 ```
-python walk_harness/generate_audio.py
-python walk_harness/run_realtime_evals.py
+uv run python walk_harness/generate_audio.py
+uv run python walk_harness/run_realtime_evals.py
 ```
 
 ## [Run (multi-turn simulation)](./run_harness)
@@ -69,7 +82,7 @@ python walk_harness/run_realtime_evals.py
 Run:
 
 ```
-python run_harness/run_realtime_evals.py
+uv run python run_harness/run_realtime_evals.py
 ```
 
 ## Results layout
@@ -78,8 +91,17 @@ Each harness writes into its own `results/` folder:
 
 - `results/<run_id>/results.csv`: per-example outputs and grades
 - `results/<run_id>/summary.json`: aggregate metrics
+- `results/<run_id>/plots/*.png`: warm-editorial summary charts for scores, latency, tokens, and run shape
 - `results/<run_id>/events/*.jsonl`: full realtime event stream per datapoint
 
+The shared typed representation of these artifacts for the crawl, walk, and run harnesses lives in `shared/result_types.py`.
+
+To render charts for an existing run after the fact:
+
+```bash
+uv run python plot_eval_results.py --run-dir run_harness/results/<run_id>
+```
+
 ## Common CLI flags
 
 All harnesses share a core set of flags so you can switch between them easily:
@@ -88,5 +110,6 @@ All harnesses share a core set of flags so you can switch between them easily:
 - `--model` (assistant under test), `--system-prompt-file`, `--tools-file`
 - `--chunk-ms`, `--sample-rate-hz`, `--real-time`
 - `--max-examples` (quick smoke tests)
+- `--skip-plots` (skip post-run PNG chart generation)
 
 Run harness adds multi-turn and simulator-specific flags (see its README).
@@ -28,6 +28,7 @@ Each run writes:
 
 - `results.csv`: Per-example outputs and grades.
 - `summary.json`: Aggregate metrics (latency + correctness).
+- `plots/*.png`: Styled score, latency, token, and status charts.
 - `audio/<example_id>/input.wav`: The TTS input audio.
 - `audio/<example_id>/output.wav`: The model output audio (if any).
 - `events/<example_id>.jsonl`: Realtime event stream for the datapoint.
@@ -66,6 +67,7 @@ Common options:
 - `--output-audio-format`: Output audio format (default `pcm16`).
 - `--real-time`: Stream audio in real-time cadence.
 - `--max-examples`: Limit number of examples for quick checks.
+- `--skip-plots`: Skip post-run PNG chart generation.
 
 ## Adapt