Skip to content

Commit e73f6ff

Browse files
Fix legal grid alignment, add game completion eval, benchmark improvements (#54)
* Fix legal grid off-by-one, add game completion eval, benchmark improvements

Legal grid alignment fix:
- legal_grid from compute_legal_move_masks is aligned with move_ids (legal moves at position *before* each move), but the trainer checks it against targets which are shifted by one (target[ply] = move_ids[ply+1]). Shift the grid by one ply in create_validation_set so it aligns with targets. This was causing legal_move_rate to always report 0%.

Game completion eval:
- New compute_game_completion() walks each game ply-by-ply checking whether the model's argmax prediction is legal. Reports: game_completion_rate (fraction of games without any illegal move), avg_pct_completion (mean fraction completed before forfeit), avg_plies_to_forfeit.
- Computed on 64 val games at each eval_interval using dense token masks.

Benchmark improvements:
- CPU/RAM reporting now checks cgroup limits (v1 and v2) before falling back to /proc, so containers report their actual allocation instead of the host's full resources.
- Default warmup iterations bumped from 3 to 10 — torch.compile needs more iterations to fully optimize, inflating timed results otherwise.

Theoretical ceiling script:
- Add --max-ply flag (was hardcoded to 255).

* Fix test to use targets as ground-truth preds after legal grid shift

The legal grid in create_validation_set is now shifted by one ply to align with targets. The test was using input_ids as predictions, which matched the old unshifted grid. Switch to targets.

* Address PR review feedback

- Remove dead gc_targets variable and unused n_checked counter
- Rename avg_plies_to_forfeit → avg_plies_completed (completed games contribute their full game_length to the average)
- Free all GPU tensors in game completion eval cleanup
- Move chess_engine import to top of trainer.py
- Extract shift_legal_mask() into pawn/data.py to deduplicate the np.roll + zero-fill pattern between data.py and trainer.py
- Use math.ceil for fractional CPU counts in cgroup detection
1 parent 46a8c3e commit e73f6ff

File tree

5 files changed

+233
-30
lines changed

5 files changed

+233
-30
lines changed

pawn/data.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,22 @@ def _watchdog():
240240
step += 1
241241

242242

243+
def shift_legal_mask(mask: np.ndarray) -> np.ndarray:
    """Shift a legal move mask forward by one ply to align with CLM targets.

    The engine's legal masks are indexed by position *before* each move:
    mask[ply] = legal moves at position ply. But CLM targets[ply] is the
    *next* move (= move_ids[ply+1]), so we need legal moves at position
    ply+1. The result drops the first ply's entry and leaves the final
    ply zeroed (there is no next move at the final ply).

    Works for any mask shape (B, T, ...).
    """
    # Equivalent to np.roll(mask, -1, axis=1) followed by zeroing the
    # wrapped-around last slot; slicing skips the pointless wrap copy.
    aligned = np.zeros_like(mask)
    aligned[:, :-1] = mask[:, 1:]
    return aligned
257+
258+
243259
def create_validation_set(
244260
n_games: int, max_ply: int, seed: int,
245261
discard_ply_limit: bool = False,
@@ -266,9 +282,11 @@ def create_validation_set(
266282
"loss_mask": torch.from_numpy(loss_mask),
267283
}
268284

269-
# Compute legal move masks for evaluating legal move rate
285+
# Compute legal move masks for evaluating legal move rate.
286+
# Shift by one ply so legal_grid[ply] aligns with targets[ply]
287+
# (see shift_legal_mask docstring).
270288
legal_grid, _legal_promo = engine.compute_legal_move_masks(move_ids, game_lengths)
271-
batch["legal_grid"] = torch.from_numpy(legal_grid).long()
289+
batch["legal_grid"] = torch.from_numpy(shift_legal_mask(legal_grid)).long()
272290
batch["game_lengths"] = torch.from_numpy(game_lengths).long()
273291

274292
if no_outcome and prepend_outcome:

pawn/trainer.py

Lines changed: 103 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,17 @@
1414
import time
1515
from datetime import datetime, timezone
1616

17+
import numpy as np
1718
import psutil
1819
import torch
1920
import torch.nn as nn
2021
import torch.nn.functional as F
2122
from torch.utils.data import DataLoader
2223

24+
import chess_engine as engine
2325
from pawn.config import CLMConfig, TrainingConfig
2426
from pawn.model import PAWNCLM
25-
from pawn.data import CLMDataset, create_validation_set
27+
from pawn.data import CLMDataset, create_validation_set, shift_legal_mask
2628
from pawn.logging import MetricsLogger
2729

2830
from pawn.data_utils import unpack_grid
@@ -239,6 +241,68 @@ def compute_legal_move_rate_from_preds(
239241

240242

241243

244+
def compute_game_completion(
    preds: torch.Tensor,
    legal_mask: torch.Tensor,
    loss_mask: torch.Tensor,
    game_lengths: torch.Tensor,
) -> dict[str, float]:
    """Measure how often the model gets through a full game without illegal moves.

    For each game, walks ply-by-ply checking whether the argmax prediction is
    legal. The first illegal move is a "forfeit".

    Args:
        preds: (B, T) argmax token predictions (aligned with targets)
        legal_mask: (B, T, V) bool — legal token mask (shifted to align with targets)
        loss_mask: (B, T) bool — which positions are valid
        game_lengths: (B,) int — number of valid plies per game

    Returns dict with:
        game_completion_rate: fraction of games with zero illegal moves
        avg_pct_completion: mean fraction of game completed before forfeit
        avg_plies_completed: mean plies completed before first illegal move.
            Games with no illegal moves contribute their full game_length.
    """
    n_games, seq_len = preds.shape
    vocab = legal_mask.shape[2]

    with torch.no_grad():
        n_clean = 0
        fractions: list[float] = []
        ply_counts: list[float] = []

        for g in range(n_games):
            limit = min(int(game_lengths[g].item()), seq_len)
            first_illegal = -1
            for ply in range(limit):
                # Only score positions the loss mask marks as valid.
                if not loss_mask[g, ply]:
                    continue
                # Skip plies with no legal moves (end-of-game padding).
                if not legal_mask[g, ply].any():
                    continue
                choice = int(preds[g, ply].item())
                # Out-of-range tokens and tokens outside the legal mask
                # both count as a forfeit (short-circuit avoids OOB index).
                if choice >= vocab or not legal_mask[g, ply, choice]:
                    first_illegal = ply
                    break

            if first_illegal < 0:
                n_clean += 1
                fractions.append(1.0)
                ply_counts.append(float(limit))
            else:
                fractions.append(first_illegal / limit if limit > 0 else 0.0)
                ply_counts.append(float(first_illegal))

    return {
        "game_completion_rate": n_clean / n_games if n_games > 0 else 0.0,
        "avg_pct_completion": sum(fractions) / len(fractions) if fractions else 0.0,
        "avg_plies_completed": sum(ply_counts) / len(ply_counts) if ply_counts else 0.0,
    }
304+
305+
242306
def _get_grad_norm(model: nn.Module) -> float:
243307
grads = [p.grad.data for p in model.parameters() if p.grad is not None]
244308
if not grads:
@@ -513,6 +577,39 @@ def evaluate(self) -> dict[str, float]:
513577
torch.cuda.empty_cache()
514578
avg = {f"val/{k}": v / n_batches for k, v in total_metrics.items()}
515579
avg["val/perplexity"] = math.exp(min(avg["val/loss"], 20.0))
580+
581+
# Game completion eval: can the model get through entire games
582+
# without picking an illegal move? Uses a small subset to avoid
583+
# materializing a large dense (B, T, vocab) token mask.
584+
if "game_lengths" in self.val_data:
585+
gc_n = min(64, n)
586+
gc_input = self.val_data["input_ids"][:gc_n].to(self.device)
587+
gc_loss_mask = self.val_data["loss_mask"][:gc_n].to(self.device)
588+
gc_game_lengths = self.val_data["game_lengths"][:gc_n].to(self.device)
589+
move_ids = self.val_data["input_ids"][:gc_n].numpy().astype(np.int16)
590+
gl_np = self.val_data["game_lengths"][:gc_n].numpy().astype(np.int16)
591+
vocab_size = self.model_cfg.vocab_size
592+
593+
with torch.no_grad():
594+
with torch.amp.autocast(self.device, enabled=self.cfg.use_amp):
595+
hidden = model.forward_eval(gc_input, gc_loss_mask)
596+
gc_logits = model.lm_head(hidden)
597+
gc_preds = gc_logits.argmax(dim=-1)
598+
599+
legal_tokens = engine.compute_legal_token_masks(move_ids, gl_np, vocab_size)
600+
legal_mask_t = torch.from_numpy(
601+
shift_legal_mask(legal_tokens)
602+
).to(self.device)
603+
604+
gc = compute_game_completion(gc_preds, legal_mask_t, gc_loss_mask, gc_game_lengths)
605+
avg["val/game_completion_rate"] = gc["game_completion_rate"]
606+
avg["val/avg_pct_completion"] = gc["avg_pct_completion"]
607+
avg["val/avg_plies_completed"] = gc["avg_plies_completed"]
608+
609+
del gc_input, gc_loss_mask, gc_game_lengths, legal_mask_t, gc_logits, gc_preds
610+
if self.device != "cpu" and torch.cuda.is_available():
611+
torch.cuda.empty_cache()
612+
516613
return avg
517614

518615
def train(self):
@@ -602,6 +699,11 @@ def _graceful_exit(signum, frame):
602699
val_msg += f" | legal {val_metrics['val/legal_move_rate']:.3f}"
603700
if "val/late_legal_move_rate" in val_metrics:
604701
val_msg += f" | late_legal {val_metrics['val/late_legal_move_rate']:.3f}"
702+
if "val/game_completion_rate" in val_metrics:
703+
val_msg += (
704+
f" | complete {val_metrics['val/game_completion_rate']:.3f}"
705+
f" | avg_ply {val_metrics['val/avg_plies_completed']:.0f}"
706+
)
605707

606708
# Compound early stopping
607709
extra_log: dict[str, object] = {}

scripts/benchmark.py

Lines changed: 102 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,8 +1093,78 @@ def _collect_cpu_cache() -> dict[str, str]:
10931093
return cache
10941094

10951095

1096+
def _cgroup_cpu_count() -> int | None:
    """Return container CPU limit from cgroups, or None if unconstrained.

    Checks, in order: cgroup v2 quota (cpu.max), cgroup v1 quota
    (cfs_quota_us / cfs_period_us), then the effective cpuset. Fractional
    quotas round up so a 1.5-CPU container reports 2.
    """
    import math as _math

    # cgroup v2: cpu.max contains "quota period" (e.g. "200000 100000" = 2 CPUs);
    # the literal "max" means no limit.
    try:
        quota_s, period_s = Path("/sys/fs/cgroup/cpu.max").read_text().strip().split()
        if quota_s != "max":
            return max(1, _math.ceil(int(quota_s) / int(period_s)))
    except (OSError, ValueError):
        pass

    # cgroup v1: cpu.cfs_quota_us / cpu.cfs_period_us; quota <= 0 means no limit.
    try:
        v1_quota = int(Path("/sys/fs/cgroup/cpu/cpu.cfs_quota_us").read_text().strip())
        if v1_quota > 0:
            v1_period = int(
                Path("/sys/fs/cgroup/cpu/cpu.cfs_period_us").read_text().strip()
            )
            return max(1, _math.ceil(v1_quota / v1_period))
    except (OSError, ValueError):
        pass

    # cpuset: count CPUs in the effective cpuset (ranges like "0-3,8-11").
    for cpuset_path in ("/sys/fs/cgroup/cpuset.cpus.effective",  # v2
                        "/sys/fs/cgroup/cpuset/cpuset.cpus"):    # v1
        try:
            spec = Path(cpuset_path).read_text().strip()
        except OSError:
            continue
        if not spec:
            continue
        try:
            total = 0
            for chunk in spec.split(","):
                lo, _, hi = chunk.partition("-")
                total += (int(hi) - int(lo) + 1) if hi else 1
            return total
        except ValueError:
            continue

    return None
1136+
1137+
1138+
def _cgroup_memory_bytes() -> int | None:
    """Return container memory limit from cgroups, or None if unconstrained.

    cgroup v2 reports the literal "max" when unlimited; cgroup v1 reports
    a huge sentinel (~2**63), which is treated as no limit.
    """
    # cgroup v2: memory.max holds a byte count or the literal "max"
    try:
        raw = Path("/sys/fs/cgroup/memory.max").read_text().strip()
        if raw != "max":
            return int(raw)
    except (OSError, ValueError):
        pass

    # cgroup v1: memory.limit_in_bytes; the kernel uses a ~2**63 sentinel
    # when unconstrained, so anything at or above 2**62 is treated as "no limit".
    try:
        v1_limit = int(
            Path("/sys/fs/cgroup/memory/memory.limit_in_bytes").read_text().strip()
        )
        if v1_limit < 2**62:
            return v1_limit
    except (OSError, ValueError):
        pass

    return None
1159+
1160+
10961161
def _collect_system_info() -> dict:
1097-
"""Collect CPU, RAM, and cache info."""
1162+
"""Collect CPU, RAM, and cache info.
1163+
1164+
In containers (RunPod, Docker), /proc/cpuinfo and /proc/meminfo report
1165+
host-level resources. We check cgroup limits first and prefer those
1166+
when they indicate a constrained environment.
1167+
"""
10981168
import multiprocessing
10991169

11001170
cpu_name = ""
@@ -1122,25 +1192,34 @@ def _collect_system_info() -> dict:
11221192
except (OSError, ValueError):
11231193
pass # keep /proc/cpuinfo MHz if available
11241194

1125-
try:
1126-
cpu_count = len(os.sched_getaffinity(0))
1127-
except (AttributeError, OSError):
1128-
cpu_count = multiprocessing.cpu_count() or 0
1129-
1130-
# System RAM
1131-
ram_gb = 0.0
1132-
try:
1133-
import psutil
1134-
ram_gb = psutil.virtual_memory().total / (1024**3)
1135-
except ImportError:
1195+
# CPU count: prefer cgroup limit over host-visible CPUs
1196+
cg_cpus = _cgroup_cpu_count()
1197+
if cg_cpus is not None:
1198+
cpu_count = cg_cpus
1199+
else:
11361200
try:
1137-
with open("/proc/meminfo") as f:
1138-
for line in f:
1139-
if line.startswith("MemTotal:"):
1140-
ram_gb = int(line.split()[1]) / (1024**2)
1141-
break
1142-
except OSError:
1143-
pass
1201+
cpu_count = len(os.sched_getaffinity(0))
1202+
except (AttributeError, OSError):
1203+
cpu_count = multiprocessing.cpu_count() or 0
1204+
1205+
# System RAM: prefer cgroup limit over host total
1206+
cg_mem = _cgroup_memory_bytes()
1207+
if cg_mem is not None:
1208+
ram_gb = cg_mem / (1024**3)
1209+
else:
1210+
ram_gb = 0.0
1211+
try:
1212+
import psutil
1213+
ram_gb = psutil.virtual_memory().total / (1024**3)
1214+
except ImportError:
1215+
try:
1216+
with open("/proc/meminfo") as f:
1217+
for line in f:
1218+
if line.startswith("MemTotal:"):
1219+
ram_gb = int(line.split()[1]) / (1024**2)
1220+
break
1221+
except OSError:
1222+
pass
11441223

11451224
info: dict = {
11461225
"python": sys.version.split()[0],
@@ -1411,8 +1490,10 @@ def main():
14111490
# Iteration control
14121491
parser.add_argument("--n-iter", type=int, default=10,
14131492
help="Timed iterations per benchmark (default: 10)")
1414-
parser.add_argument("--n-warmup", type=int, default=3,
1415-
help="Warmup iterations per benchmark (default: 3)")
1493+
parser.add_argument("--n-warmup", type=int, default=10,
1494+
help="Warmup iterations per benchmark (default: 10). "
1495+
"torch.compile may need 5-10+ steps to fully optimize; "
1496+
"too few warmup steps inflates timed results.")
14161497

14171498
# Output
14181499
parser.add_argument("--json", type=str, default=None,

scripts/compute_theoretical_ceiling.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ def main():
7777
parser.add_argument("--output", type=str, default="data/theoretical_ceiling.json")
7878
parser.add_argument("--model-accuracy", type=float, default=None,
7979
help="Model top-1 accuracy to compute adjusted score")
80+
parser.add_argument("--max-ply", type=int, default=255,
81+
help="Maximum game length in plies (default: 255)")
8082
parser.add_argument("--bootstrap", type=int, default=2000,
8183
help="Number of bootstrap resamples for CIs (0 to skip)")
8284
args = parser.parse_args()
@@ -116,7 +118,7 @@ def main():
116118
bt = time.time()
117119
result = engine.compute_accuracy_ceiling(
118120
n_games=batch_n,
119-
max_ply=255,
121+
max_ply=args.max_ply,
120122
n_rollouts=args.rollouts,
121123
sample_rate=args.sample_rate,
122124
seed=batch_seed,

tests/model/test_512_token.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -378,13 +378,13 @@ class TestPlyRangeFilter:
378378
def setup(self):
    """Generate val data with known legal grids at seq_len=64.

    Uses ``targets`` as predictions — legal_grid is shifted by one ply
    in create_validation_set to align with targets (target[p] is the move
    at ply p+1). This gives a legal rate of ~1.0 for move positions.
    """
    val = create_validation_set(n_games=16, max_ply=64, seed=42)
    # targets[p] = the next move (ply p+1), aligned with legal_grid[p]
    val["preds"] = val["targets"].clone()
    return val
389389

390390
@pytest.mark.integration

0 commit comments

Comments
 (0)