---
library_name: pawn
license: apache-2.0
base_model:
  - thomas-schweich/pawn-small
  - thomas-schweich/pawn-base
  - thomas-schweich/pawn-large
tags:
  - chess
  - transformer
  - world-model
  - causal-lm
  - next-token-prediction
  - representation-learning
  - pytorch
  - rust
model_name: PAWN-{{ variant_name }}
pipeline_tag: other
citation: |
  {% raw %}@software{schweich2026pawn,
    author = {Schweich, Thomas},
    title = {{PAWN}: Playstyle-Agnostic World-model Network for Chess},
    year = {2026},
    url = {https://github.com/thomas-schweich/PAWN},
    license = {Apache-2.0}
  }{% endraw %}
model_params: {{ params_num }}
d_model: {{ d_model }}
n_layers: {{ n_layers }}
n_heads: {{ n_heads }}
d_ff: {{ d_ff }}
context_length: 256
vocab_size: 4284
datasets:
  - random-chess-games
language:
  - en
metrics:
  - accuracy
model-index:
  - name: PAWN-{{ variant_name }}
    results:
      - task:
          type: next-token-prediction
          name: Chess Move Prediction (Random Games)
        metrics:
{% if legal_rate is not none %}
          - name: Legal Move Rate
            type: accuracy
            value: {{ "%.4f"|format(legal_rate / 100) }}
{% endif %}
          - name: Top-1 Accuracy
            type: accuracy
            value: {{ "%.4f"|format(top1 / 100) }}
{% if top5 is not none %}
          - name: Top-5 Accuracy
            type: accuracy
            value: {{ "%.4f"|format(top5 / 100) }}
{% endif %}
          - name: Val Loss
            type: loss
            value: {{ "%.4f"|format(val_loss) }}
          - name: Games Seen
            type: other
            value: 25600000
---

# PAWN-{{ variant_name }}

**PAWN** (Playstyle-Agnostic World-model Network for Chess) is a causal transformer trained on random chess games. It learns legal moves, board-state representations, and game dynamics purely from uniformly random legal move sequences -- no strategic play, no hand-crafted features, no external game databases.

This is the **{{ variant_label }}** variant ({{ params }} parameters). PAWN is designed as a frozen backbone for parameter-efficient finetuning into player models with arbitrary playstyles.

**[GitHub Repository](https://github.com/thomas-schweich/PAWN)** -- full source code, training scripts, adapter implementations, and documentation.

## All Variants

| Variant | Parameters | Link |
|---------|------------|------|
| PAWN-Small | ~9.5M | [thomas-schweich/pawn-small](https://huggingface.co/thomas-schweich/pawn-small) |
| PAWN (Base) | ~35.8M | [thomas-schweich/pawn-base](https://huggingface.co/thomas-schweich/pawn-base) |
| PAWN-Large | ~68.4M | [thomas-schweich/pawn-large](https://huggingface.co/thomas-schweich/pawn-large) |

## Headline Metrics

| Metric | Value |
|--------|-------|
{% if legal_rate is not none %}| Legal move rate | {{ "%.2f"|format(legal_rate) }}% |
{% endif %}| Top-1 accuracy | {{ "%.2f"|format(top1) }}% |
{% if top5 is not none %}| Top-5 accuracy | {{ "%.2f"|format(top5) }}% |
{% endif %}| Val loss | {{ "%.3f"|format(val_loss) }} |

### Accuracy Ratios

PAWN is trained on uniformly random chess games, so top-1 accuracy has a hard theoretical ceiling. A ratio above 100% of the unconditioned ceiling indicates that the model has learned structure beyond simply identifying legal moves. See the [Accuracy Ceiling Analysis](https://github.com/thomas-schweich/PAWN/blob/main/docs/ACCURACY_CEILING.md).

| Ceiling | Ratio |
|---------|-------|
| Unconditioned (E\[1/N_legal\] = {{ "%.2f"|format(uncond_ceiling) }}%) | {{ uncond_ratio }}% |
| Naive-conditioned (1-ply filter = {{ "%.2f"|format(naive_ceiling) }}%) | {{ naive_ratio }}% |
| Bayes-optimal conditioned (MCTS, 32 rollouts = {{ "%.2f"|format(mcts_ceiling) }}%) | {{ mcts_ratio }}% |
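The unconditioned ceiling in the first row follows from a simple argument: under uniformly random play, a position with N legal moves gives any predictor at most a 1/N chance of naming the move actually played, so the best achievable top-1 accuracy is E[1/N_legal] averaged over positions. A stdlib-only sketch of that computation (the move counts below are hypothetical placeholders, not the real distribution, which the repo estimates by sampling random games):

```python
# Why E[1/N_legal] caps top-1 accuracy on uniformly random games:
# with N equally likely legal moves, picking any single move is
# correct with probability exactly 1/N; average that over positions.
from statistics import mean

n_legal_counts = [20, 20, 22, 30, 35, 31, 28, 40, 33, 25]  # hypothetical
ceiling = mean(1.0 / n for n in n_legal_counts)
print(f"unconditioned top-1 ceiling ~ {ceiling:.2%}")
```

The conditioned ceilings in the other rows are higher because conditioning on the game's continuation (a 1-ply legality filter, or MCTS rollouts) rules out some moves, shrinking the effective N.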
{% if probes %}

## Probe Results

Linear probes trained on frozen hidden states measure how well the model's internal representations encode board-level features.

| Probe | Accuracy | Description |
|-------|----------|-------------|
{% for probe in probes -%}
| {{ probe.name }} | {{ probe.result }} | {{ probe.description }} |
{% endfor %}
{% endif %}
{% if diagnostics %}

## Diagnostic Results

Edge-case diagnostics measure the model's legal move rate in specific tactical situations.

| Category | Positions | Legal Rate |
|----------|-----------|------------|
{% for diag in diagnostics -%}
| {{ diag.name }} | {{ diag.n }} | {{ diag.value }} |
{% endfor %}
{% endif %}

## Architecture

| Parameter | Value |
|-----------|-------|
| Architecture | Decoder-only transformer |
| d_model | {{ d_model }} |
| Layers | {{ n_layers }} |
| Attention heads | {{ n_heads }} |
| Head dimension | {{ head_dim }} |
| d_ff | {{ d_ff }} |
| Parameters | {{ params }} |
| Vocabulary | 4,284 tokens |
| Context length | 256 tokens |
| Normalization | Pre-norm RMSNorm |
| FFN | SwiGLU (4x expansion) |
| Positional encoding | Rotary (RoPE, base 10000) |
| Embeddings | Factored (src + dst + promo) |
| Dropout | 0.0 |

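The SwiGLU row in the table above refers to the gated feed-forward variant from Shazeer (2020): a SiLU-gated up-projection multiplied elementwise with a linear up-projection, then projected back down. A minimal PyTorch sketch of the idea (dimensions are placeholders; the repo's actual module and parameter names may differ):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLU(nn.Module):
    """SwiGLU feed-forward block: down(SiLU(gate(x)) * up(x)).

    Sketch only -- the real d_model/d_ff come from the variant's CLMConfig.
    """

    def __init__(self, d_model: int, d_ff: int):
        super().__init__()
        self.gate = nn.Linear(d_model, d_ff, bias=False)
        self.up = nn.Linear(d_model, d_ff, bias=False)
        self.down = nn.Linear(d_ff, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down(F.silu(self.gate(x)) * self.up(x))

ffn = SwiGLU(d_model=256, d_ff=1024)
y = ffn(torch.randn(2, 8, 256))
print(y.shape)  # torch.Size([2, 8, 256])
```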
## Training Details

| Parameter | Value |
|-----------|-------|
| Training data | On-the-fly uniformly random legal games (no external dataset) |
| Objective | Next-token cross-entropy (non-padding positions only) |
| Total steps | 100,000 |
| Batch size | 256 |
| Games seen | 25,600,000 |
| Learning rate | 3e-4 (cosine decay with 1,000-step warmup) |
| Optimizer | AdamW (weight decay 0.01) |
| Precision | Mixed (AMP) |
| Hardware | NVIDIA H200 |

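The learning-rate row above (3e-4 peak, cosine decay, 1,000-step warmup over 100,000 steps) can be sketched as a small stdlib function; exact endpoint handling in the repo's trainer may differ slightly:

```python
import math

def lr_at(step: int, max_steps: int = 100_000, warmup: int = 1_000,
          peak: float = 3e-4, floor: float = 0.0) -> float:
    """Linear warmup to `peak`, then cosine decay to `floor` (sketch)."""
    if step < warmup:
        return peak * step / warmup           # linear ramp from 0
    progress = (step - warmup) / (max_steps - warmup)
    return floor + 0.5 * (peak - floor) * (1 + math.cos(math.pi * progress))

print(lr_at(0), lr_at(1_000), lr_at(100_000))  # 0.0, 3e-4, ~0.0
```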
## Usage

### Loading the model

```python
import torch
from safetensors.torch import load_file
from pawn.config import CLMConfig
from pawn.model import PAWNCLM

cfg = CLMConfig.{{ variant_factory }}()
model = PAWNCLM(cfg).cuda().eval()
weights = load_file("model.safetensors", device="cuda")
model.load_state_dict(weights)
```

Or load directly from HuggingFace:

```python
from pawn.checkpoint import load_backbone_weights
from pawn.config import CLMConfig
from pawn.model import PAWNCLM

weights, config = load_backbone_weights("thomas-schweich/pawn-{{ variant_key }}")
cfg = CLMConfig.{{ variant_factory }}()
model = PAWNCLM(cfg).eval()
model.load_state_dict(weights)
```

### Finetuning with an adapter

```bash
uv run python scripts/train_bottleneck.py \
    --checkpoint thomas-schweich/pawn-{{ variant_key }} \
    --pgn thomas-schweich/pawn-lichess-full \
    --bottleneck-dim 32 --lr 1e-4 --local-checkpoints
```
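The `--bottleneck-dim 32` flag corresponds to a Houlsby-style bottleneck adapter (see Acknowledgments): a small down-project / nonlinearity / up-project residual branch trained while the PAWN backbone stays frozen. A minimal sketch of the idea -- class and attribute names here are illustrative, not the repo's API:

```python
import torch
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    """Houlsby-style bottleneck adapter (illustrative sketch).

    Down-project the hidden state to a small dimension, apply a
    nonlinearity, project back up, and add the result residually.
    Only these two small matrices train; the backbone stays frozen.
    """

    def __init__(self, d_model: int, bottleneck_dim: int = 32):
        super().__init__()
        self.down = nn.Linear(d_model, bottleneck_dim)
        self.act = nn.GELU()
        self.up = nn.Linear(bottleneck_dim, d_model)
        # Zero-init the up-projection so the adapter starts as an
        # identity map and finetuning begins from pretrained behavior.
        nn.init.zeros_(self.up.weight)
        nn.init.zeros_(self.up.bias)

    def forward(self, h: torch.Tensor) -> torch.Tensor:
        return h + self.up(self.act(self.down(h)))

adapter = BottleneckAdapter(d_model=128, bottleneck_dim=32)
x = torch.randn(4, 128)
print(torch.allclose(adapter(x), x))  # True at init (identity start)
```

With d_model = 128 and bottleneck 32, the adapter adds roughly 2 x 128 x 32 weights per insertion point -- a small fraction of any PAWN variant's parameter count.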

## Acknowledgments

PAWN builds on ideas and tools from the following projects and publications:

| Component | Reference |
|-----------|-----------|
| Transformer | [Vaswani et al., "Attention Is All You Need", NeurIPS 2017](https://arxiv.org/abs/1706.03762) |
| RMSNorm | [Zhang & Sennrich, "Root Mean Square Layer Normalization", NeurIPS 2019](https://arxiv.org/abs/1910.07467) |
| RoPE | [Su et al., "RoFormer: Enhanced Transformer with Rotary Position Embedding", 2021](https://arxiv.org/abs/2104.09864) |
| SwiGLU | [Shazeer, "GLU Variants Improve Transformer", 2020](https://arxiv.org/abs/2002.05202) |
| AdamW | [Loshchilov & Hutter, "Decoupled Weight Decay Regularization", ICLR 2019](https://arxiv.org/abs/1711.05101) |
| Cosine schedule | [Loshchilov & Hutter, "SGDR: Stochastic Gradient Descent with Warm Restarts", ICLR 2017](https://arxiv.org/abs/1608.03983) |
| Mixed precision | [Micikevicius et al., "Mixed Precision Training", ICLR 2018](https://arxiv.org/abs/1710.03740) |
| Bottleneck adapters | [Houlsby et al., "Parameter-Efficient Transfer Learning for NLP", ICML 2019](https://arxiv.org/abs/1902.00751) |
| LoRA | [Hu et al., "LoRA: Low-Rank Adaptation of Large Language Models", ICLR 2022](https://arxiv.org/abs/2106.09685) |
| FiLM | [Perez et al., "FiLM: Visual Reasoning with a General Conditioning Layer", AAAI 2018](https://arxiv.org/abs/1709.07871) |
| RoSA | [Nikdan et al., "RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation", 2024](https://arxiv.org/abs/2401.04679) |
| Linear probes | [Alain & Bengio, "Understanding Intermediate Layers Using Linear Classifier Probes", ICLR Workshop 2017](https://arxiv.org/abs/1610.01644) |
| Intrinsic dimensionality | [Aghajanyan et al., "Intrinsic Dimensionality Explains the Effectiveness of Language Model Fine-Tuning", ACL 2021](https://arxiv.org/abs/2012.13255) |
| MAIA | [McIlroy-Young et al., "Aligning Superhuman AI with Human Behavior: Chess as a Model System", KDD 2020](https://arxiv.org/abs/2006.01855) |
| AlphaZero | [Silver et al., "A General Reinforcement Learning Algorithm that Masters Chess, Shogi, and Go through Self-Play", Science 2018](https://arxiv.org/abs/1712.01815) |
| Leela Chess Zero | [github.com/LeelaChessZero/lc0](https://github.com/LeelaChessZero/lc0) |
| shakmaty | [github.com/niklasf/shakmaty](https://github.com/niklasf/shakmaty) |
| PyO3 | [github.com/PyO3/pyo3](https://github.com/PyO3/pyo3) |
| Lichess | [lichess.org](https://lichess.org/) / [database.lichess.org](https://database.lichess.org/) |

## Citation

{% raw %}
```bibtex
@software{schweich2026pawn,
  author = {Schweich, Thomas},
  title = {{PAWN}: Playstyle-Agnostic World-model Network for Chess},
  year = {2026},
  url = {https://github.com/thomas-schweich/PAWN},
  license = {Apache-2.0}
}
```
{% endraw %}

## License

Apache 2.0. See [LICENSE](https://github.com/thomas-schweich/PAWN/blob/main/LICENSE).