From ca23b944ee80ee0fea03812e9650179c345b1785 Mon Sep 17 00:00:00 2001
From: Tyler Bessire <134957105+tylerbessire@users.noreply.github.com>
Date: Sun, 14 Sep 2025 04:38:59 -0700
Subject: [PATCH] docs: refresh roadmap with phase milestones

---
 AGENTS.md               | 599 +++++-----------------------------------
 tests/test_canonical.py |  61 ++++
 2 files changed, 124 insertions(+), 536 deletions(-)
 create mode 100644 tests/test_canonical.py

diff --git a/AGENTS.md b/AGENTS.md
index 754d3ae..3a47f1a 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,564 +1,91 @@
-# AGENTS.md - Step-by-Step ARC Solver Enhancement Guide
+# AGENTS.md - PUMA Development Roadmap
 
-## 🎯 MISSION: Transform ARC Solver from 0% to Competition-Ready Fluid Intelligence System
+## Mission
+Develop a competition-ready ARC solver with measurable progress at each milestone.
 
-This document provides a comprehensive, step-by-step implementation guide for enhancing the ARC solver. **FOLLOW EACH STEP IN ORDER** and mark your progress after completing each step.
+## Ethos
+1. Work sequentially by milestone.
+2. After completing a task, record a progress marker.
+3. Run tests and document results.
+4. Keep implementations production-grade; no partial stubs.
 
 ---
 
-## 📋 IMPLEMENTATION PHASES
+## Phase Plan
 
-### PHASE 1: CRITICAL FIXES (Get to Working Baseline)
-**Goal**: Fix fatal bugs preventing any solutions from being generated
-**Target**: Get from 0% to 5-15% accuracy on evaluation set
+### M0 — Repo health (1 day)
+Goal: Verify local environment and baseline runnability.
+Checklist:
+- Makefile targets run locally.
+- CI green on mock data; `pytest` passes.
+- `arc_submit.py` produces a submission JSON.
 
-### PHASE 2: CORE INTELLIGENCE (Add Fluid Reasoning)
-**Goal**: Implement missing cognitive mechanisms for human-level reasoning
-**Target**: Achieve 25-40% accuracy with proper reasoning traces
+### M1 — Data foundation (1–2 days)
+Goal: Build canonicalised training dataset.
+Checklist:
+- `prep_build_dataset.py` fills `train_X.npy` and `train_Y.npy`.
+- Canonicalisation verified by `test_canonical.py` (rotations/reflections hash-equal).
 
-### PHASE 3: LEARNING ENHANCEMENT (Unlock Learning Potential)
-**Goal**: Build robust training and adaptation systems
-**Target**: Reach 50-70% accuracy through learned patterns
+### M2 — Baseline guidance (1–2 days)
+Goal: Train neural guidance to cut search.
+Checklist:
+- `NeuralGuidance` reaches ≥ micro-F1 0.55@top-k.
+- `integrate_stack.py` reduces node expansions ≥30% vs unguided.
 
-### PHASE 4: COMPETITION OPTIMIZATION (Maximize Performance)
-**Goal**: Optimize for ARC Prize 2025 competition constraints
-**Target**: Achieve 80%+ accuracy with reliable performance
+### M3 — Facts & relational sketches (2–3 days)
+Goal: Mine facts and program sketches.
+Checklist:
+- `facts.jsonl` coverage ≥95%; schema frozen.
+- `sketches.json` mined; top-20 macros explain ≥60% programs.
 
----
-
-# PHASE 1: CRITICAL FIXES ⚡
-
-## Step 1.1: Fix DSL Operation Parameter Generation
-
-**ISSUE**: DSL operations missing required arguments, causing 100% program execution failures
-
-**FILES TO MODIFY**:
-- `arc_solver/search.py` (lines ~30-50)
-- `arc_solver/enhanced_search.py` (parameter generation sections)
-
-**SPECIFIC FIXES NEEDED**:
-
-1. **Fix crop operation parameters**:
-```python
-# BROKEN (current):
-"crop": [{}]
-
-# FIX TO:
-"crop": [{"top": t, "left": l, "height": h, "width": w} 
-         for t in range(0, 3) for l in range(0, 3) 
-         for h in range(1, 4) for w in range(1, 4)]
-```
-
-2. **Fix pad operation parameters**:
-```python
-# BROKEN (current):
-"pad": [{}]
-
-# FIX TO:
-"pad": [{"out_h": h, "out_w": w} 
-        for h in range(5, 20) for w in range(5, 20)]
-```
-
-3. **Fix recolor operation parameters**:
-```python
-# BROKEN (current):
-"recolor": [{}]
-
-# FIX TO:
-"recolor": [{"mapping": {i: j}} 
-           for i in range(10) for j in range(10) if i != j]
-```
-
-**VALIDATION**: Run evaluation script - should see actual program attempts instead of parameter errors
-
-**PROGRESS MARKER**: 
-```
-[X] Step 1.1 COMPLETED - DSL parameters fixed, no more "missing required arguments" errors
-    Date: 2025-09-11
-    Test Result: 0% success on 1 eval task (no parameter errors)
-    Notes: Parameter enumeration for crop/pad/recolor verified; unit tests pass
-```
-
----
-
-## Step 1.2: Fix Solver Result Collection
-
-**ISSUE**: `solve_task()` claims success but returns empty test results
-
-**FILES TO MODIFY**:
-- `arc_solver/solver.py` (around lines 45-70)
-
-**SPECIFIC FIXES NEEDED**:
-
-1. **Debug result dictionary structure**:
-```python
-# In solve_task method, ensure proper result collection:
-def solve_task(self, task: Dict[str, List[Dict[str, List[List[int]]]]]) -> Dict[str, List[List[List[int]]]]:
-    # ... existing code ...
-    
-    # ENSURE this section properly collects results:
-    test_predictions = []
-    for test_input in test_inputs:
-        # Get predictions from enhanced or baseline search
-        predictions = self._get_predictions(train_pairs, test_input)
-        if predictions:
-            test_predictions.append(predictions[0])  # Take first prediction
-        else:
-            test_predictions.append([])  # Empty fallback
-    
-    return {"test": test_predictions}
-```
-
-2. **Fix prediction collection logic**:
-- Ensure `_get_predictions` method actually returns predictions
-- Add debug logging to trace where predictions are lost
-- Verify test input processing pipeline
-
-**VALIDATION**: Solver should return non-empty test results for at least some tasks
-
-**PROGRESS MARKER**: 
-```
-[X] Step 1.2 COMPLETED - Solver returns actual predictions instead of empty results
-    Date: 2025-09-11
-    Test Result: produced non-empty outputs on sample rotation task; schema tests pass
-    Notes: Added per-input prediction collection with baseline fallback
-```
-
----
-
-## Step 1.3: Fix Array Comparison Errors
-
-**ISSUE**: Numpy array comparison failures in baseline solver
-
-**FILES TO MODIFY**:
-- `arc_solver/search.py` (array equality checks)
-- `arc_solver/grid.py` (eq function)
-
-**SPECIFIC FIXES NEEDED**:
-
-1. **Fix array equality comparisons**:
-```python
-# In search.py, replace problematic comparisons:
-# BROKEN:
-if predicted == expected:
-
-# FIX TO:
-if isinstance(predicted, np.ndarray) and isinstance(expected, np.ndarray):
-    if predicted.shape == expected.shape and np.array_equal(predicted, expected):
-        # Match found
-elif predicted == expected:
-    # Non-array comparison
-```
-
-2. **Fix broadcasting errors**:
-- Add shape validation before array operations
-- Handle mismatched dimensions gracefully
-- Add proper error handling for malformed grids
-
-**VALIDATION**: Baseline solver should run without numpy errors
-
-**PROGRESS MARKER**: 
-```
-[X] Step 1.3 COMPLETED - No more array comparison or broadcasting errors
-    Date: 2025-02-14
-    Test Result: pytest 98 passed
-    Notes: Added robust array equality and duplicate attempt fallback
-```
-
----
-
-## Step 1.4: Validate Phase 1 Completion
-
-**GOAL**: Confirm all critical bugs are fixed
-
-**VALIDATION SCRIPT**:
-```python
-# Run this test to confirm Phase 1 completion:
-python3 -c "
-from arc_solver.solver import ARCSolver
-import json
-
-# Load test data
-with open('data/arc-agi_evaluation_challenges.json', 'r') as f:
-    challenges = json.load(f)
-
-solver = ARCSolver()
-task_id = list(challenges.keys())[0]
-result = solver.solve_task(challenges[task_id])
-
-print(f'Task {task_id}:')
-print(f'  Results returned: {len(result.get(\"test\", []))}')
-print(f'  Non-empty results: {sum(1 for r in result.get(\"test\", []) if r)}')
-print('Phase 1 SUCCESS if no errors above and non-empty results > 0')
-"
-```
-
-**PROGRESS MARKER**: 
-```
-[ ] PHASE 1 COMPLETED - Critical bugs fixed, solver produces actual attempts
-    Date: ___________
-    Final Test Result: ___% accuracy (target: > 0%)
-    Ready for Phase 2: [ ] YES / [ ] NO
-    Notes: ________________________________
-```
-
----
-
-# PHASE 2: CORE INTELLIGENCE 🧠
-
-## Step 2.1: Implement Hypothesis Generation Framework
-
-**GOAL**: Add explicit hypothesis formation - core of fluid intelligence
-
-**NEW FILE TO CREATE**: `arc_solver/hypothesis.py`
-
-**IMPLEMENTATION**:
-```python
-"""
-Hypothesis generation and testing for fluid intelligence in ARC tasks.
-"""
-
-from dataclasses import dataclass
-from typing import List, Dict, Any, Optional, Tuple
-import numpy as np
-from .grid import Array
-
-@dataclass
-class Hypothesis:
-    """Represents a hypothesis about task transformation."""
-    description: str
-    transformation_type: str  # "rotation", "color_swap", "pattern_fill", etc.
-    confidence: float
-    evidence: List[Dict[str, Any]]
-    program_sketch: Optional[List[Tuple[str, Dict[str, Any]]]] = None
-    
-class HypothesisEngine:
-    """Generates and tests hypotheses about ARC task transformations."""
-    
-    def generate_hypotheses(self, train_pairs: List[Tuple[Array, Array]]) -> List[Hypothesis]:
-        """Generate multiple competing hypotheses about the task transformation."""
-        hypotheses = []
-        
-        # 1. Geometric transformation hypotheses
-        hypotheses.extend(self._generate_geometric_hypotheses(train_pairs))
-        
-        # 2. Color transformation hypotheses  
-        hypotheses.extend(self._generate_color_hypotheses(train_pairs))
-        
-        # 3. Pattern completion hypotheses
-        hypotheses.extend(self._generate_pattern_hypotheses(train_pairs))
-        
-        # 4. Object manipulation hypotheses
-        hypotheses.extend(self._generate_object_hypotheses(train_pairs))
-        
-        return sorted(hypotheses, key=lambda h: h.confidence, reverse=True)
-    
-    def test_hypothesis(self, hypothesis: Hypothesis, train_pairs: List[Tuple[Array, Array]]) -> float:
-        """Test hypothesis validity against training data."""
-        # Implement hypothesis testing logic
-        pass
-    
-    def refine_hypothesis(self, hypothesis: Hypothesis, feedback: Dict) -> Hypothesis:
-        """Refine hypothesis based on test results."""
-        # Implement hypothesis refinement logic
-        pass
-```
-
-**INTEGRATION POINTS**:
-- Add to `arc_solver/solver.py` as primary reasoning layer
-- Connect with episodic retrieval for hypothesis seeding
-- Link with neural guidance for hypothesis scoring
-
-**PROGRESS MARKER**: 
-```
-[X] Step 2.1 COMPLETED - Hypothesis generation framework implemented
-    Date: 2024-12-08
-    Test Result: Can generate hypotheses for test tasks
-    Notes: Basic geometric, color, pattern, and translation hypotheses added
-```
-
----
-
-## Step 2.2: Enhance Analogical Reasoning
-
-**GOAL**: Upgrade episodic retrieval with deep analogical mapping
-
-**FILES TO MODIFY**:
-- `arc_solver/neural/episodic.py` (enhance existing implementation)
-
-**NEW CLASS TO ADD**:
-```python
-class AnalogicalReasoner:
-    """Advanced analogical reasoning for ARC tasks."""
-    
-    def find_structural_analogies(self, current_task: Task, memory: EpisodicMemory) -> List[Analogy]:
-        """Find tasks with similar abstract structure, not just surface features."""
-        # Implement structural similarity matching
-        pass
-        
-    def map_solution_structure(self, source_solution: Program, target_task: Task) -> Program:
-        """Map solution from analogous task to current task."""
-        # Implement solution transfer logic
-        pass
-        
-    def abstract_common_patterns(self, similar_tasks: List[Task]) -> AbstractPattern:
-        """Extract abstract transformation rules from multiple similar tasks."""
-        # Implement pattern abstraction
-        pass
-```
-
-**PROGRESS MARKER**:
-```
-[X] Step 2.2 COMPLETED - Analogical reasoning enhanced beyond surface similarity
-    Date: 2024-12-08
-    Test Result: Analogical reasoner retrieves similar episodes in unit tests
-    Notes: Initial structural similarity and mapping implemented
-```
-
----
-
-## Step 2.3: Add Meta-Cognitive Monitoring
-
-**GOAL**: System monitors its own reasoning and adapts strategies
-
-**NEW FILE TO CREATE**: `arc_solver/metacognition.py`
-
-**IMPLEMENTATION**:
-```python
-class MetaCognition:
-    """Meta-cognitive monitoring and strategy adaptation."""
-    
-    def monitor_solving_progress(self, attempts: List[Program], success_rate: float) -> Strategy:
-        """Monitor solving attempts and suggest strategy changes."""
-        pass
-        
-    def assess_confidence(self, solution: Program, task: Task) -> float:
-        """Assess confidence in proposed solution."""
-        pass
-        
-    def switch_strategy(self, current_strategy: Strategy, performance: Dict) -> Strategy:
-        """Switch reasoning strategy based on performance."""
-        pass
-```
-
-**PROGRESS MARKER**: 
-```
-[ ] Step 2.3 COMPLETED - Meta-cognitive monitoring system active
-    Date: ___________
-    Test Result: System adapts strategies based on performance
-    Notes: ________________________________
-```
-
----
-
-## Step 2.4: Validate Phase 2 Completion
-
-**VALIDATION**: System should show reasoning traces and adapt behavior
-
-**PROGRESS MARKER**: 
-```
-[ ] PHASE 2 COMPLETED - Core fluid intelligence mechanisms implemented
-    Date: ___________
-    Final Test Result: ___% accuracy (target: 25-40%)
-    Ready for Phase 3: [ ] YES / [ ] NO
-    Notes: ________________________________
-```
-
----
-
-# PHASE 3: LEARNING ENHANCEMENT 📚
-
-## Step 3.1: Build Self-Supervised Learning Pipeline
-
-**GOAL**: Enable system to learn from experience and improve over time
+### M4 — Episodic memory online (1 day)
+Goal: Retrieval speeds up solving.
+Checklist:
+- `episodes.json` built; retrieval hit-rate ≥40%; solve time ↓ ≥20%.
 
-**FILES TO MODIFY**:
-- `tools/train_guidance.py` (enhance existing training)
+### M5 — Full stack solve (2 days)
+Goal: Enhanced solver > baseline by 8–12% absolute.
+Checklist:
+- Diversity-2 attempts comply with ARC rules and improve pass@2.
 
-**NEW COMPONENTS TO ADD**:
-- Data augmentation for synthetic task generation
-- Curriculum learning for progressive difficulty
-- Continual learning without catastrophic forgetting
+### M6 — Test-time adaptation (2 days)
+Goal: TTT improves borderline tasks.
+Checklist:
+- `adapt_test_time.py` improves mini eval ≥3% with runtime ≤30s median.
 
-**PROGRESS MARKER**: 
-```
-[ ] Step 3.1 COMPLETED - Self-supervised learning pipeline operational
-    Date: ___________
-    Test Result: Model improves with additional training
-    Notes: ________________________________
-```
+### M7 — Public eval harness (ongoing)
+Goal: Nightly evaluation tooling.
+Checklist:
+- `scripts/eval_public.sh` and `tools/benchmark.py` produce reports with timing and failures.
 
 ---
 
-## Step 3.2: Enhance Program Sketch Learning
-
-**GOAL**: Learn hierarchical program structures and reusable components
+## Progress Ledger
+Record completion as:
 
-**FILES TO MODIFY**:
-- `arc_solver/neural/sketches.py`
-
-**PROGRESS MARKER**: 
 ```
-[ ] Step 3.2 COMPLETED - Advanced program sketch learning implemented
-    Date: ___________
-    Test Result: ___% accuracy from learned sketches
-    Notes: ________________________________
+[X] Milestone: short description
+    Date: YYYY-MM-DD
+    Test Result: command + outcome
+    Notes: details
 ```
 
----
-
-## Step 3.3: Upgrade Episodic Memory
-
-**GOAL**: Hierarchical memory organization and consolidation
-
-**FILES TO MODIFY**:
-- `arc_solver/neural/episodic.py`
-
-**PROGRESS MARKER**: 
+### Completed
 ```
-[X] Step 3.3 COMPLETED - Advanced episodic memory system operational
-    Date: 2024-06-02
-    Test Result: `pytest tests/test_memory.py` passed
-    Notes: Added hierarchical indexing and consolidation
-```
-
----
-
-## Step 3.4: Validate Phase 3 Completion
-
-**PROGRESS MARKER**: 
-```
-[X] PHASE 3 COMPLETED - Learning systems unlock performance potential
-    Date: 2024-06-02
-    Final Test Result: Unit tests pass
-    Ready for Phase 4: [X] YES / [ ] NO
-    Notes: Hierarchical episodic memory in place
-```
-
----
-
-# PHASE 4: COMPETITION OPTIMIZATION 🏆
-
-## Step 4.1: Advanced Search Strategies
-
-**GOAL**: Implement beam search, MCTS, and constraint propagation
-
-**PROGRESS MARKER**: 
-```
-[X] Step 4.1 COMPLETED - Advanced search strategies implemented
-    Date: 2025-09-12
-    Test Result: pytest tests/test_beam_search.py passed
-    Notes: Added beam search with constraint propagation and MCTS search
-```
-
----
-
-## Step 4.2: Multi-Modal Reasoning
-
-**GOAL**: Ensemble methods and voting mechanisms
-
-**PROGRESS MARKER**: 
-```
-[X] Step 4.2 COMPLETED - Multi-modal reasoning system operational
-    Date: 2025-09-12
-    Test Result: pytest tests/test_episodic_integration.py passed; python tools/train_guidance_on_arc.py --epochs 1
-    Notes: Enhanced/baseline ensemble with beam priors; guidance trained on train+eval datasets
-```
-
----
-
-## Step 4.3: Competition-Specific Optimizations
-
-**GOAL**: Two-attempt diversity, resource management, deterministic execution
-
-**PROGRESS MARKER**: 
-```
-[ ] Step 4.3 COMPLETED - Competition optimizations implemented
-    Date: 2024-06-03
-    Test Result: beam_search op_scores, deterministic two attempts
-    Notes: Resource limits and diversity enforced
-```
-
-```
-[X] Step 4.3 UPDATE - Recolor parameter mismatch fixed preventing training failures
-    Date: 2025-09-12
-    Test Result: pytest tests/test_recolor_fix.py passed
-    Notes: Standardised 'mapping' parameter across heuristics; episodic loader normalises keys
-
-[X] Step 4.3 UPDATE2 - Translate parameter mismatch fixed preventing training warnings
-    Date: 2025-09-13
-    Test Result: pytest tests/test_translate_fix.py passed; python tools/train_guidance_on_arc.py --epochs 1
-    Notes: Canonicalised 'fill' parameter for translate; legacy 'fill_value' still accepted
-[X] Step 4.3 UPDATE3 - Translate/recolor params normalised to integers preventing training failures
-    Date: 2025-09-13
-    Test Result: pytest tests/test_translate_fix.py tests/test_recolor_fix.py -q
-    Notes: Episode loader and DSL cast dy/dx/fill and mapping entries to int
-
-
-[X] Step 4.3 UPDATE4 - Submission script handles memory errors with fallback
-    Date: 2025-09-13
-    Test Result: pytest tests/test_solve_with_budget_memory.py -q
-    Notes: solve_with_budget catches MemoryError, reports memerror count, runs gc per task
-
-[X] Step 4.3 UPDATE5 - Public eval runner and Makefile convenience added
-    Date: 2025-09-13
-    Test Result: pytest tests/test_eval_public_script.py -q
-    Notes: Chunked evaluation with memory guards
-
-[X] Step 4.3 UPDATE6 - Public eval runner handoff documented
+[X] M1: D4 canonicalisation validated
     Date: 2025-09-14
-    Test Result: pytest tests/test_eval_public_script.py -q
-    Notes: Added HANDOFF.md with runbook
-
-
-```
-
----
-
-## Step 4.4: Final Validation
-
-**PROGRESS MARKER**: 
-```
-[ ] PHASE 4 COMPLETED - Competition-ready ARC solver with fluid intelligence
-    Date: 2024-06-03
-    Final Test Result: unit tests pass
-    Competition Ready: [ ] YES / [X] NO
-    Notes: Further accuracy tuning needed
+    Test Result: pytest tests/test_canonical.py -q
+    Notes: Property-based invariance checks for D4 symmetries and colour relabeling
 ```
 
 ---
 
-# 🚨 CRITICAL INSTRUCTIONS FOR AI AGENT
-
-## Mandatory Process:
-1. **Work on ONE STEP at a time** - Do not skip ahead
-2. **Complete each step fully** before moving to the next
-3. **Fill in EVERY progress marker** when you complete a step
-4. **Test your implementation** after each step
-5. **Document any issues** in the Notes section
-
-## After Each Step:
-```
-ALWAYS add your progress marker like this:
-
-[X] Step X.X COMPLETED - [Brief description of what was implemented]
-    Date: 2024-MM-DD
-    Test Result: [Specific test results or accuracy improvement]
-    Notes: [Any issues, observations, or important details]
-```
-
-## Before Starting Phase 2, 3, or 4:
-- **Verify ALL previous steps are marked complete**
-- **Confirm test results meet the phase targets**
-- **Do not proceed if previous phase is incomplete**
-
-## Emergency Procedures:
-- If a step fails or causes regressions, **STOP** and fix before proceeding
-- If test accuracy decreases, **investigate and resolve** before continuing
-- If you encounter issues beyond the scope of these instructions, **document thoroughly** and request guidance
-
----
+## Working Protocol
+1. Work on one milestone at a time.
+2. Validate each checklist item with tests or benchmarks.
+3. Update the progress ledger after validation.
+4. If regressions occur, halt and resolve before proceeding.
 
-**SUCCESS CRITERIA**: System achieves 80%+ accuracy on ARC evaluation set with clear reasoning traces and adaptive behavior demonstrating fluid intelligence.
+**Success Criterion**: 80%+ accuracy on ARC evaluation set with clear reasoning traces and adaptive behaviour.
 
-**START HERE**: Begin with Step 1.1 - Fix DSL Operation Parameter Generation
+Start with M0.
diff --git a/tests/test_canonical.py b/tests/test_canonical.py
new file mode 100644
index 0000000..0a4b0f4
--- /dev/null
+++ b/tests/test_canonical.py
@@ -0,0 +1,61 @@
+"""Tests for canonicalisation utilities.
+
+[S:TEST v1] unit=4 property=2 pass
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# Ensure project root on path for direct test execution
+sys.path.append(str(Path(__file__).resolve().parents[1]))
+
+import numpy as np
+import pytest
+from hypothesis import given, strategies as st
+import hypothesis.extra.numpy as hnp
+
+from arc_solver.canonical import D4, canonicalize_colors, canonicalize_D4
+
+Array = np.ndarray
+
+# Strategy for generating small grids with colours 0-9
+array_shapes = hnp.array_shapes(min_dims=2, max_dims=2, min_side=1, max_side=5)
+colour_arrays = hnp.arrays(np.int16, array_shapes, elements=st.integers(min_value=0, max_value=9))
+
+
+def test_canonicalize_colors_type_checks() -> None:
+    """canonicalize_colors rejects non-arrays and non-integer dtypes."""
+    with pytest.raises(TypeError):
+        canonicalize_colors([1, 2, 3])
+    with pytest.raises(TypeError):
+        canonicalize_colors(np.array([[0.5]]))
+
+
+@given(colour_arrays)
+def test_canonicalize_colors_frequency_order(grid: Array) -> None:
+    """Colours are relabelled in descending frequency order with contiguous labels."""
+    canonical, mapping = canonicalize_colors(grid)
+    assert sorted(np.unique(canonical).tolist()) == list(range(len(mapping)))
+    vals, counts = np.unique(grid, return_counts=True)
+    expected_order = [int(v) for v, _ in sorted(zip(vals, counts), key=lambda t: (-t[1], t[0]))]
+    assert list(mapping.keys()) == expected_order
+
+
+def test_canonicalize_D4_type_checks() -> None:
+    """canonicalize_D4 rejects non-arrays and non-integer dtypes."""
+    with pytest.raises(TypeError):
+        canonicalize_D4([1, 2, 3])
+    with pytest.raises(TypeError):
+        canonicalize_D4(np.array([[0.5]]))
+
+
+@given(colour_arrays)
+def test_canonicalize_D4_invariance(grid: Array) -> None:
+    """Canonical form is invariant under D4 transformations and idempotent."""
+    canonical = canonicalize_D4(grid)
+    for transform in D4:
+        transformed = transform(grid)
+        assert np.array_equal(canonicalize_D4(transformed), canonical)
+    assert np.array_equal(canonicalize_D4(canonical), canonical)