Skip to content

Commit 40b7195

Browse files
Surface forfeit-rate log-linear fit in lab_status for pretraining runs (#56)
* lab: surface forfeit-rate log-linear fit in lab_status for pretrain runs The log-linear fit on forfeit rate has become the most informative late-stage convergence signal — it keeps moving after val_loss, accuracy, legal_move_rate, and late_legal_move_rate have all plateaued. The dashboard chart shows it, but the MCP status tool didn't, so a monitoring agent couldn't see the trend without reading the val log lines by hand. New helper `read_pretrain_val_summary(trial)` in monitor.py scans the trial's metrics.jsonl for val records with game_completion_rate, returns: - latest: all key val fields from the most recent val record - forfeit_fit: log-linear regression over the most recent half of (step, forfeit_rate) points, reported as slope_per_step, half_life_steps, current_forfeit, n_points. Matches the dashboard's fit exactly (same second-half-of-history window). `status()` in runner.py attaches the summary under `pretrain` on running trial rows when run_type == "pretrain". Metrics.jsonl is the source of truth — no new state tracked on the Trial object, so recovery and resume Just Work. Updated lab_status docstring so agents know the new field exists. * Address PR review feedback 1. Drop the redundant local 'import math as _math' — math is already imported at module level. 2. 
Add test coverage for read_pretrain_val_summary: - returns None when run_dir is missing - returns None when metrics.jsonl is missing - returns None when there are no val records - returns None when val records exist but lack game_completion_rate (adapter runs) - returns only 'latest' when fewer than 4 val records are present - returns correct slope and half_life for a synthetic exponential decay series (OLS recovers the known k within 1e-6) - omits forfeit_fit entirely when all forfeit rates are exactly 0 - latest dict carries all documented fields - current_forfeit tracks the unfiltered last-of-full-series value Plus runner integration tests: - runner.status() attaches 'pretrain' block to pretrain run_type rows - runner.status() omits 'pretrain' block for adapter run_type rows 3. Clarify forfeit_fit fields in the docstring: list current_forfeit and explain that it's the last value of the *full* forfeit series (unlike slope/half_life, which come from the half-window OLS).
1 parent 0f96df2 commit 40b7195

File tree

5 files changed

+354
-5
lines changed

5 files changed

+354
-5
lines changed

pawn/lab/monitor.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,105 @@ def read_metrics(
116116
trial.best_accuracy = acc
117117

118118

119+
def read_pretrain_val_summary(trial: Trial) -> dict[str, Any] | None:
120+
"""Scan the trial's metrics.jsonl for the latest pretraining val record
121+
and compute a log-linear fit on forfeit rate over the most recent half
122+
of the history.
123+
124+
Returns a dict with:
125+
latest: the latest val record's key fields (game_completion_rate,
126+
avg_plies_completed, forfeit min/max/median, legal, late_legal,
127+
val_loss, step)
128+
forfeit_fit: {slope_per_step, half_life_steps, n_points,
129+
current_forfeit} computed from the most recent half of the
130+
(step, forfeit_rate) series — matches the dashboard's
131+
log-linear fit. The OLS itself is restricted to strictly
132+
positive forfeit rates (log(0) would blow up), but
133+
`current_forfeit` is always the last observed forfeit rate
134+
from the full series — including 0.0 if the most recent eval
135+
had no forfeits. Omitted if fewer than 4 val records have
136+
game_completion_rate (not a pretraining run or too early),
137+
or if fewer than 3 positive-forfeit points land in the
138+
half-window.
139+
140+
Returns None if no val records are available.
141+
"""
142+
if trial.run_dir is None:
143+
return None
144+
metrics_path = Path(trial.run_dir) / "metrics.jsonl"
145+
if not metrics_path.exists():
146+
return None
147+
148+
steps: list[int] = []
149+
forfeit_rates: list[float] = []
150+
latest: dict[str, Any] | None = None
151+
152+
try:
153+
with open(metrics_path) as f:
154+
for line in f:
155+
try:
156+
rec = json.loads(line)
157+
except (json.JSONDecodeError, ValueError):
158+
continue
159+
if rec.get("type") != "val":
160+
continue
161+
gc = rec.get("val/game_completion_rate")
162+
if gc is None:
163+
continue
164+
step = rec.get("step")
165+
if step is None:
166+
continue
167+
steps.append(int(step))
168+
forfeit_rates.append(1.0 - float(gc))
169+
latest = rec
170+
except OSError:
171+
return None
172+
173+
if not latest:
174+
return None
175+
176+
summary: dict[str, Any] = {
177+
"latest": {
178+
"step": latest.get("step"),
179+
"val_loss": latest.get("val/loss"),
180+
"game_completion_rate": latest.get("val/game_completion_rate"),
181+
"avg_plies_completed": latest.get("val/avg_plies_completed"),
182+
"forfeit_ply_min": latest.get("val/min_forfeit_ply"),
183+
"forfeit_ply_max": latest.get("val/max_forfeit_ply"),
184+
"forfeit_ply_median": latest.get("val/median_forfeit_ply"),
185+
"legal_move_rate": latest.get("val/legal_move_rate"),
186+
"late_legal_move_rate": latest.get("val/late_legal_move_rate"),
187+
},
188+
}
189+
190+
n = len(steps)
191+
if n >= 4:
192+
half = n // 2
193+
xs = steps[half:]
194+
ys = forfeit_rates[half:]
195+
# Only fit on strictly positive forfeit rates
196+
pos = [(x, y) for x, y in zip(xs, ys) if y > 0]
197+
if len(pos) >= 3:
198+
xs_f = [float(x) for x, _ in pos]
199+
ys_log = [math.log(y) for _, y in pos]
200+
n_pts = len(xs_f)
201+
mean_x = sum(xs_f) / n_pts
202+
mean_y = sum(ys_log) / n_pts
203+
num = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs_f, ys_log))
204+
den = sum((x - mean_x) ** 2 for x in xs_f)
205+
if den > 0:
206+
slope = num / den
207+
half_life = math.log(2) / abs(slope) if slope != 0 else None
208+
summary["forfeit_fit"] = {
209+
"slope_per_step": slope,
210+
"half_life_steps": half_life,
211+
"n_points": n_pts,
212+
"current_forfeit": forfeit_rates[-1],
213+
}
214+
215+
return summary
216+
217+
119218
def check_health(trial: Trial) -> str | None:
120219
"""Return a health issue string, or None if healthy."""
121220
loss = trial.last_train_loss

pawn/lab/runner.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,12 @@
1717
from pathlib import Path
1818
from typing import Any
1919

20-
from pawn.lab.monitor import check_health, is_alive, read_metrics
20+
from pawn.lab.monitor import (
21+
check_health,
22+
is_alive,
23+
read_metrics,
24+
read_pretrain_val_summary,
25+
)
2126
from pawn.lab.state import Trial, _format_duration, _now_iso
2227

2328
log = logging.getLogger("pawn.lab")
@@ -521,7 +526,7 @@ def status(self) -> dict[str, Any]:
521526
for t in self.trials.values():
522527
if t.status == "running":
523528
cfg = t.config or t.params
524-
running.append({
529+
row: dict[str, Any] = {
525530
"trial": t.trial_id, "strategy": t.strategy,
526531
"step": t.current_step, "total": t.total_steps,
527532
"sps": round(t.steps_per_sec, 2),
@@ -532,7 +537,14 @@ def status(self) -> dict[str, Any]:
532537
"key_hp": {k: v for k, v in cfg.items()
533538
if k in ("lr", "lora_rank", "bottleneck_dim",
534539
"density", "d_model", "n_layers", "batch_size")},
535-
})
540+
}
541+
# For pretraining runs, surface game-completion metrics and
542+
# the log-linear forfeit-rate fit (matches the dashboard chart).
543+
if cfg.get("run_type") == "pretrain":
544+
pretrain = read_pretrain_val_summary(t)
545+
if pretrain:
546+
row["pretrain"] = pretrain
547+
running.append(row)
536548
elapsed = time.time() - self.start_time
537549
cost = (self.cost_per_hour * elapsed / 3600) if self.cost_per_hour else None
538550
return {

pawn/lab/server.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ def _runner(ctx: Context) -> TrialRunner:
3636

3737
@mcp.tool
async def lab_status(ctx: Context) -> dict[str, Any]:
    """Compact lab status: GPUs, running trials (ID, strategy, key HPs, step/total, ETA, train_loss, train_acc, val_loss, val_acc), counts, elapsed time, cost. Train metrics refresh every log_interval steps and val metrics at eval_interval; use lab_log for real-time stdout.

    Rows for running pretraining trials additionally include a `pretrain` block: the latest game-completion metrics (game_completion_rate, avg_plies_completed, forfeit_ply min/max/median, legal/late_legal) plus a `forfeit_fit` sub-block holding a log-linear fit over the most recent half of the forfeit-rate history (slope_per_step, half_life_steps, current_forfeit, n_points). That fit is the primary late-stage convergence signal — it keeps moving after val_loss plateaus."""
    runner = _runner(ctx)
    return runner.status()
4143

4244

tests/lab/test_monitor.py

Lines changed: 179 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,12 @@
99

1010
import pytest
1111

12-
from pawn.lab.monitor import check_health, is_alive, read_metrics
12+
from pawn.lab.monitor import (
13+
check_health,
14+
is_alive,
15+
read_metrics,
16+
read_pretrain_val_summary,
17+
)
1318
from pawn.lab.state import Trial
1419

1520

@@ -318,3 +323,176 @@ def test_no_total_steps_uses_500_threshold(self):
318323
assert issue is not None
319324
assert isinstance(issue, str) and len(issue) > 0
320325
assert "NaN" in issue or "Inf" in issue
326+
327+
328+
# =====================================================================
329+
# read_pretrain_val_summary
330+
# =====================================================================
331+
332+
333+
def _val_record(step: int, gc: float, **kwargs) -> dict:
334+
"""Build a minimal val record with game_completion_rate."""
335+
rec = {
336+
"type": "val",
337+
"step": step,
338+
"val/loss": 3.0,
339+
"val/game_completion_rate": gc,
340+
"val/avg_plies_completed": 300.0,
341+
"val/min_forfeit_ply": 10.0,
342+
"val/max_forfeit_ply": 400.0,
343+
"val/median_forfeit_ply": 100.0,
344+
"val/legal_move_rate": 0.997,
345+
"val/late_legal_move_rate": 0.993,
346+
}
347+
rec.update(kwargs)
348+
return rec
349+
350+
351+
class TestReadPretrainValSummary:
    """Coverage for read_pretrain_val_summary: None-returning guard paths,
    the latest-only shape when the series is too short for a fit, and OLS
    correctness on a synthetic exponential decay."""

    def test_no_run_dir_returns_none(self):
        trial = _make_trial()
        assert read_pretrain_val_summary(trial) is None

    def test_missing_metrics_file_returns_none(self, tmp_path):
        trial = _make_trial()
        trial.run_dir = str(tmp_path / "does_not_exist")
        assert read_pretrain_val_summary(trial) is None

    def test_no_val_records_returns_none(self, tmp_path):
        run_dir = tmp_path / "run_x"
        _write_metrics_file(run_dir / "metrics.jsonl", [
            {"type": "train", "step": 10, "train/loss": 2.0},
        ])
        trial = _make_trial()
        trial.run_dir = str(run_dir)
        assert read_pretrain_val_summary(trial) is None

    def test_val_records_without_game_completion_return_none(self, tmp_path):
        """Adapter runs log val records but no game_completion_rate."""
        run_dir = tmp_path / "run_x"
        _write_metrics_file(run_dir / "metrics.jsonl", [
            {"type": "val", "step": 100, "val/loss": 2.0, "val/accuracy": 0.5},
            {"type": "val", "step": 200, "val/loss": 1.9, "val/accuracy": 0.55},
        ])
        trial = _make_trial()
        trial.run_dir = str(run_dir)
        assert read_pretrain_val_summary(trial) is None

    def test_returns_latest_without_fit_when_too_few_records(self, tmp_path):
        """Need n >= 4 val records to even attempt the fit."""
        run_dir = tmp_path / "run_x"
        _write_metrics_file(run_dir / "metrics.jsonl", [
            _val_record(1000, 0.5),
            _val_record(2000, 0.4),
            _val_record(3000, 0.3),
        ])
        trial = _make_trial()
        trial.run_dir = str(run_dir)
        summary = read_pretrain_val_summary(trial)

        assert summary is not None
        assert "latest" in summary
        assert summary["latest"]["step"] == 3000
        assert summary["latest"]["game_completion_rate"] == pytest.approx(0.3)
        assert "forfeit_fit" not in summary

    def test_returns_fit_on_known_series(self, tmp_path):
        """Construct a known exponential decay and verify OLS recovers it.

        We pick forfeit_rate(step) = exp(-k * step + b) so log(forfeit) is
        exactly linear in step with slope -k. The fit uses the second half
        of the history.
        """
        run_dir = tmp_path / "run_x"
        k = 1e-5  # half-life = ln(2)/k ~= 69314 steps
        b = math.log(0.5)  # forfeit(0) = 0.5

        records = []
        n = 20
        for i in range(n):
            step = (i + 1) * 1000
            forfeit = math.exp(-k * step + b)
            gc = 1.0 - forfeit
            records.append(_val_record(step, gc))

        _write_metrics_file(run_dir / "metrics.jsonl", records)
        trial = _make_trial()
        trial.run_dir = str(run_dir)
        summary = read_pretrain_val_summary(trial)

        assert summary is not None
        assert "forfeit_fit" in summary
        fit = summary["forfeit_fit"]
        assert fit["slope_per_step"] == pytest.approx(-k, rel=1e-6)
        assert fit["half_life_steps"] == pytest.approx(math.log(2) / k, rel=1e-6)
        # Second half of n=20 → 10 points
        assert fit["n_points"] == 10
        # current_forfeit is the last overall series value, not the fit window's
        expected_current = math.exp(-k * (n * 1000) + b)
        assert fit["current_forfeit"] == pytest.approx(expected_current)

    def test_all_zero_forfeit_omits_fit(self, tmp_path):
        """If every forfeit rate is exactly 0 (perfect completion), the OLS
        window has no positive points to fit and forfeit_fit is omitted."""
        run_dir = tmp_path / "run_x"
        _write_metrics_file(run_dir / "metrics.jsonl", [
            _val_record(1000, 1.0),
            _val_record(2000, 1.0),
            _val_record(3000, 1.0),
            _val_record(4000, 1.0),
            _val_record(5000, 1.0),
        ])
        trial = _make_trial()
        trial.run_dir = str(run_dir)
        summary = read_pretrain_val_summary(trial)

        assert summary is not None
        assert summary["latest"]["game_completion_rate"] == pytest.approx(1.0)
        assert "forfeit_fit" not in summary

    def test_latest_records_carries_all_fields(self, tmp_path):
        """All documented latest fields are present when available."""
        run_dir = tmp_path / "run_x"
        _write_metrics_file(run_dir / "metrics.jsonl", [
            _val_record(
                5000, 0.9,
                **{"val/avg_plies_completed": 321.5,
                   "val/min_forfeit_ply": 25.0,
                   "val/max_forfeit_ply": 300.0,
                   "val/median_forfeit_ply": 120.0,
                   "val/loss": 2.9,
                   "val/legal_move_rate": 0.996,
                   "val/late_legal_move_rate": 0.992}),
        ])
        trial = _make_trial()
        trial.run_dir = str(run_dir)
        summary = read_pretrain_val_summary(trial)

        assert summary is not None
        latest = summary["latest"]
        assert latest["step"] == 5000
        assert latest["val_loss"] == pytest.approx(2.9)
        assert latest["game_completion_rate"] == pytest.approx(0.9)
        assert latest["avg_plies_completed"] == pytest.approx(321.5)
        assert latest["forfeit_ply_min"] == pytest.approx(25.0)
        assert latest["forfeit_ply_max"] == pytest.approx(300.0)
        assert latest["forfeit_ply_median"] == pytest.approx(120.0)
        assert latest["legal_move_rate"] == pytest.approx(0.996)
        assert latest["late_legal_move_rate"] == pytest.approx(0.992)

    def test_current_forfeit_is_last_of_full_series(self, tmp_path):
        """`current_forfeit` tracks the unfiltered last value, even when it's
        zero and got dropped from the OLS window."""
        run_dir = tmp_path / "run_x"
        # 10 decaying records, then a final record at 100% completion (forfeit=0)
        records = [_val_record(i * 1000, 1.0 - math.exp(-i * 0.2)) for i in range(1, 11)]
        records.append(_val_record(11_000, 1.0))  # forfeit = 0.0
        _write_metrics_file(run_dir / "metrics.jsonl", records)
        trial = _make_trial()
        trial.run_dir = str(run_dir)
        summary = read_pretrain_val_summary(trial)

        assert summary is not None
        # If forfeit_fit was computed, current_forfeit should be the unfiltered last value
        if "forfeit_fit" in summary:
            assert summary["forfeit_fit"]["current_forfeit"] == pytest.approx(0.0)

0 commit comments

Comments
 (0)