From ca23b944ee80ee0fea03812e9650179c345b1785 Mon Sep 17 00:00:00 2001 From: Tyler Bessire <134957105+tylerbessire@users.noreply.github.com> Date: Sun, 14 Sep 2025 04:38:59 -0700 Subject: [PATCH] docs: refresh roadmap with phase milestones --- AGENTS.md | 599 +++++----------------------------------- tests/test_canonical.py | 61 ++++ 2 files changed, 124 insertions(+), 536 deletions(-) create mode 100644 tests/test_canonical.py diff --git a/AGENTS.md b/AGENTS.md index 754d3ae..3a47f1a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,564 +1,91 @@ -# AGENTS.md - Step-by-Step ARC Solver Enhancement Guide +# AGENTS.md - PUMA Development Roadmap -## 🎯 MISSION: Transform ARC Solver from 0% to Competition-Ready Fluid Intelligence System +## Mission +Develop a competition-ready ARC solver with measurable progress at each milestone. -This document provides a comprehensive, step-by-step implementation guide for enhancing the ARC solver. **FOLLOW EACH STEP IN ORDER** and mark your progress after completing each step. +## Ethos +1. Work sequentially by milestone. +2. After completing a task, record a progress marker. +3. Run tests and document results. +4. Keep implementations production-grade; no partial stubs. --- -## 📋 IMPLEMENTATION PHASES +## Phase Plan -### PHASE 1: CRITICAL FIXES (Get to Working Baseline) -**Goal**: Fix fatal bugs preventing any solutions from being generated -**Target**: Get from 0% to 5-15% accuracy on evaluation set +### M0 — Repo health (1 day) +Goal: Verify local environment and baseline runnability. +Checklist: +- Makefile targets run locally. +- CI green on mock data; `pytest` passes. +- `arc_submit.py` produces a submission JSON. -### PHASE 2: CORE INTELLIGENCE (Add Fluid Reasoning) -**Goal**: Implement missing cognitive mechanisms for human-level reasoning -**Target**: Achieve 25-40% accuracy with proper reasoning traces +### M1 — Data foundation (1–2 days) +Goal: Build canonicalised training dataset. +Checklist: +- `prep_build_dataset.py` fills `train_X.npy` and `train_Y.npy`. +- Canonicalisation verified by `test_canonical.py` (rotations/reflections hash-equal). -### PHASE 3: LEARNING ENHANCEMENT (Unlock Learning Potential) -**Goal**: Build robust training and adaptation systems -**Target**: Reach 50-70% accuracy through learned patterns +### M2 — Baseline guidance (1–2 days) +Goal: Train neural guidance to cut search. +Checklist: +- `NeuralGuidance` reaches ≥ micro-F1 0.55@top-k. +- `integrate_stack.py` reduces node expansions ≥30% vs unguided. -### PHASE 4: COMPETITION OPTIMIZATION (Maximize Performance) -**Goal**: Optimize for ARC Prize 2025 competition constraints -**Target**: Achieve 80%+ accuracy with reliable performance +### M3 — Facts & relational sketches (2–3 days) +Goal: Mine facts and program sketches. +Checklist: +- `facts.jsonl` coverage ≥95%; schema frozen. +- `sketches.json` mined; top-20 macros explain ≥60% programs. ---- - -# PHASE 1: CRITICAL FIXES ⚡ - -## Step 1.1: Fix DSL Operation Parameter Generation - -**ISSUE**: DSL operations missing required arguments, causing 100% program execution failures - -**FILES TO MODIFY**: -- `arc_solver/search.py` (lines ~30-50) -- `arc_solver/enhanced_search.py` (parameter generation sections) - -**SPECIFIC FIXES NEEDED**: - -1. **Fix crop operation parameters**: -```python -# BROKEN (current): -"crop": [{}] - -# FIX TO: -"crop": [{"top": t, "left": l, "height": h, "width": w} - for t in range(0, 3) for l in range(0, 3) - for h in range(1, 4) for w in range(1, 4)] -``` - -2. **Fix pad operation parameters**: -```python -# BROKEN (current): -"pad": [{}] - -# FIX TO: -"pad": [{"out_h": h, "out_w": w} - for h in range(5, 20) for w in range(5, 20)] -``` - -3. **Fix recolor operation parameters**: -```python -# BROKEN (current): -"recolor": [{}] - -# FIX TO: -"recolor": [{"mapping": {i: j}} - for i in range(10) for j in range(10) if i != j] -``` - -**VALIDATION**: Run evaluation script - should see actual program attempts instead of parameter errors - -**PROGRESS MARKER**: -``` -[X] Step 1.1 COMPLETED - DSL parameters fixed, no more "missing required arguments" errors - Date: 2025-09-11 - Test Result: 0% success on 1 eval task (no parameter errors) - Notes: Parameter enumeration for crop/pad/recolor verified; unit tests pass -``` - ---- - -## Step 1.2: Fix Solver Result Collection - -**ISSUE**: `solve_task()` claims success but returns empty test results - -**FILES TO MODIFY**: -- `arc_solver/solver.py` (around lines 45-70) - -**SPECIFIC FIXES NEEDED**: - -1. **Debug result dictionary structure**: -```python -# In solve_task method, ensure proper result collection: -def solve_task(self, task: Dict[str, List[Dict[str, List[List[int]]]]]) -> Dict[str, List[List[List[int]]]]: - # ... existing code ... - - # ENSURE this section properly collects results: - test_predictions = [] - for test_input in test_inputs: - # Get predictions from enhanced or baseline search - predictions = self._get_predictions(train_pairs, test_input) - if predictions: - test_predictions.append(predictions[0]) # Take first prediction - else: - test_predictions.append([]) # Empty fallback - - return {"test": test_predictions} -``` - -2. **Fix prediction collection logic**: -- Ensure `_get_predictions` method actually returns predictions -- Add debug logging to trace where predictions are lost -- Verify test input processing pipeline - -**VALIDATION**: Solver should return non-empty test results for at least some tasks - -**PROGRESS MARKER**: -``` -[X] Step 1.2 COMPLETED - Solver returns actual predictions instead of empty results - Date: 2025-09-11 - Test Result: produced non-empty outputs on sample rotation task; schema tests pass - Notes: Added per-input prediction collection with baseline fallback -``` - ---- - -## Step 1.3: Fix Array Comparison Errors - -**ISSUE**: Numpy array comparison failures in baseline solver - -**FILES TO MODIFY**: -- `arc_solver/search.py` (array equality checks) -- `arc_solver/grid.py` (eq function) - -**SPECIFIC FIXES NEEDED**: - -1. **Fix array equality comparisons**: -```python -# In search.py, replace problematic comparisons: -# BROKEN: -if predicted == expected: - -# FIX TO: -if isinstance(predicted, np.ndarray) and isinstance(expected, np.ndarray): - if predicted.shape == expected.shape and np.array_equal(predicted, expected): - # Match found -elif predicted == expected: - # Non-array comparison -``` - -2. **Fix broadcasting errors**: -- Add shape validation before array operations -- Handle mismatched dimensions gracefully -- Add proper error handling for malformed grids - -**VALIDATION**: Baseline solver should run without numpy errors - -**PROGRESS MARKER**: -``` -[X] Step 1.3 COMPLETED - No more array comparison or broadcasting errors - Date: 2025-02-14 - Test Result: pytest 98 passed - Notes: Added robust array equality and duplicate attempt fallback -``` - ---- - -## Step 1.4: Validate Phase 1 Completion - -**GOAL**: Confirm all critical bugs are fixed - -**VALIDATION SCRIPT**: -```python -# Run this test to confirm Phase 1 completion: -python3 -c " -from arc_solver.solver import ARCSolver -import json - -# Load test data -with open('data/arc-agi_evaluation_challenges.json', 'r') as f: - challenges = json.load(f) - -solver = ARCSolver() -task_id = list(challenges.keys())[0] -result = solver.solve_task(challenges[task_id]) - -print(f'Task {task_id}:') -print(f' Results returned: {len(result.get(\"test\", []))}') -print(f' Non-empty results: {sum(1 for r in result.get(\"test\", []) if r)}') -print('Phase 1 SUCCESS if no errors above and non-empty results > 0') -" -``` - -**PROGRESS MARKER**: -``` -[ ] PHASE 1 COMPLETED - Critical bugs fixed, solver produces actual attempts - Date: ___________ - Final Test Result: ___% accuracy (target: > 0%) - Ready for Phase 2: [ ] YES / [ ] NO - Notes: ________________________________ -``` - ---- - -# PHASE 2: CORE INTELLIGENCE 🧠 - -## Step 2.1: Implement Hypothesis Generation Framework - -**GOAL**: Add explicit hypothesis formation - core of fluid intelligence - -**NEW FILE TO CREATE**: `arc_solver/hypothesis.py` - -**IMPLEMENTATION**: -```python -""" -Hypothesis generation and testing for fluid intelligence in ARC tasks. -""" - -from dataclasses import dataclass -from typing import List, Dict, Any, Optional, Tuple -import numpy as np -from .grid import Array - -@dataclass -class Hypothesis: - """Represents a hypothesis about task transformation.""" - description: str - transformation_type: str # "rotation", "color_swap", "pattern_fill", etc. - confidence: float - evidence: List[Dict[str, Any]] - program_sketch: Optional[List[Tuple[str, Dict[str, Any]]]] = None - -class HypothesisEngine: - """Generates and tests hypotheses about ARC task transformations.""" - - def generate_hypotheses(self, train_pairs: List[Tuple[Array, Array]]) -> List[Hypothesis]: - """Generate multiple competing hypotheses about the task transformation.""" - hypotheses = [] - - # 1. Geometric transformation hypotheses - hypotheses.extend(self._generate_geometric_hypotheses(train_pairs)) - - # 2. Color transformation hypotheses - hypotheses.extend(self._generate_color_hypotheses(train_pairs)) - - # 3. Pattern completion hypotheses - hypotheses.extend(self._generate_pattern_hypotheses(train_pairs)) - - # 4. Object manipulation hypotheses - hypotheses.extend(self._generate_object_hypotheses(train_pairs)) - - return sorted(hypotheses, key=lambda h: h.confidence, reverse=True) - - def test_hypothesis(self, hypothesis: Hypothesis, train_pairs: List[Tuple[Array, Array]]) -> float: - """Test hypothesis validity against training data.""" - # Implement hypothesis testing logic - pass - - def refine_hypothesis(self, hypothesis: Hypothesis, feedback: Dict) -> Hypothesis: - """Refine hypothesis based on test results.""" - # Implement hypothesis refinement logic - pass -``` - -**INTEGRATION POINTS**: -- Add to `arc_solver/solver.py` as primary reasoning layer -- Connect with episodic retrieval for hypothesis seeding -- Link with neural guidance for hypothesis scoring - -**PROGRESS MARKER**: -``` -[X] Step 2.1 COMPLETED - Hypothesis generation framework implemented - Date: 2024-12-08 - Test Result: Can generate hypotheses for test tasks - Notes: Basic geometric, color, pattern, and translation hypotheses added -``` - ---- - -## Step 2.2: Enhance Analogical Reasoning - -**GOAL**: Upgrade episodic retrieval with deep analogical mapping - -**FILES TO MODIFY**: -- `arc_solver/neural/episodic.py` (enhance existing implementation) - -**NEW CLASS TO ADD**: -```python -class AnalogicalReasoner: - """Advanced analogical reasoning for ARC tasks.""" - - def find_structural_analogies(self, current_task: Task, memory: EpisodicMemory) -> List[Analogy]: - """Find tasks with similar abstract structure, not just surface features.""" - # Implement structural similarity matching - pass - - def map_solution_structure(self, source_solution: Program, target_task: Task) -> Program: - """Map solution from analogous task to current task.""" - # Implement solution transfer logic - pass - - def abstract_common_patterns(self, similar_tasks: List[Task]) -> AbstractPattern: - """Extract abstract transformation rules from multiple similar tasks.""" - # Implement pattern abstraction - pass -``` - -**PROGRESS MARKER**: -``` -[X] Step 2.2 COMPLETED - Analogical reasoning enhanced beyond surface similarity - Date: 2024-12-08 - Test Result: Analogical reasoner retrieves similar episodes in unit tests - Notes: Initial structural similarity and mapping implemented -``` - ---- - -## Step 2.3: Add Meta-Cognitive Monitoring - -**GOAL**: System monitors its own reasoning and adapts strategies - -**NEW FILE TO CREATE**: `arc_solver/metacognition.py` - -**IMPLEMENTATION**: -```python -class MetaCognition: - """Meta-cognitive monitoring and strategy adaptation.""" - - def monitor_solving_progress(self, attempts: List[Program], success_rate: float) -> Strategy: - """Monitor solving attempts and suggest strategy changes.""" - pass - - def assess_confidence(self, solution: Program, task: Task) -> float: - """Assess confidence in proposed solution.""" - pass - - def switch_strategy(self, current_strategy: Strategy, performance: Dict) -> Strategy: - """Switch reasoning strategy based on performance.""" - pass -``` - -**PROGRESS MARKER**: -``` -[ ] Step 2.3 COMPLETED - Meta-cognitive monitoring system active - Date: ___________ - Test Result: System adapts strategies based on performance - Notes: ________________________________ -``` - ---- - -## Step 2.4: Validate Phase 2 Completion - -**VALIDATION**: System should show reasoning traces and adapt behavior - -**PROGRESS MARKER**: -``` -[ ] PHASE 2 COMPLETED - Core fluid intelligence mechanisms implemented - Date: ___________ - Final Test Result: ___% accuracy (target: 25-40%) - Ready for Phase 3: [ ] YES / [ ] NO - Notes: ________________________________ -``` - ---- - -# PHASE 3: LEARNING ENHANCEMENT 📚 - -## Step 3.1: Build Self-Supervised Learning Pipeline - -**GOAL**: Enable system to learn from experience and improve over time +### M4 — Episodic memory online (1 day) +Goal: Retrieval speeds up solving. +Checklist: +- `episodes.json` built; retrieval hit-rate ≥40%; solve time ↓ ≥20%. -**FILES TO MODIFY**: -- `tools/train_guidance.py` (enhance existing training) +### M5 — Full stack solve (2 days) +Goal: Enhanced solver > baseline by 8–12% absolute. +Checklist: +- Diversity-2 attempts comply with ARC rules and improve pass@2. -**NEW COMPONENTS TO ADD**: -- Data augmentation for synthetic task generation -- Curriculum learning for progressive difficulty -- Continual learning without catastrophic forgetting +### M6 — Test-time adaptation (2 days) +Goal: TTT improves borderline tasks. +Checklist: +- `adapt_test_time.py` improves mini eval ≥3% with runtime ≤30s median. -**PROGRESS MARKER**: -``` -[ ] Step 3.1 COMPLETED - Self-supervised learning pipeline operational - Date: ___________ - Test Result: Model improves with additional training - Notes: ________________________________ -``` +### M7 — Public eval harness (ongoing) +Goal: Nightly evaluation tooling. +Checklist: +- `scripts/eval_public.sh` and `tools/benchmark.py` produce reports with timing and failures. --- -## Step 3.2: Enhance Program Sketch Learning - -**GOAL**: Learn hierarchical program structures and reusable components +## Progress Ledger +Record completion as: -**FILES TO MODIFY**: -- `arc_solver/neural/sketches.py` - -**PROGRESS MARKER**: ``` -[ ] Step 3.2 COMPLETED - Advanced program sketch learning implemented - Date: ___________ - Test Result: ___% accuracy from learned sketches - Notes: ________________________________ +[X] Milestone: short description + Date: YYYY-MM-DD + Test Result: command + outcome + Notes: details ``` ---- - -## Step 3.3: Upgrade Episodic Memory - -**GOAL**: Hierarchical memory organization and consolidation - -**FILES TO MODIFY**: -- `arc_solver/neural/episodic.py` - -**PROGRESS MARKER**: +### Completed ``` -[X] Step 3.3 COMPLETED - Advanced episodic memory system operational - Date: 2024-06-02 - Test Result: `pytest tests/test_memory.py` passed - Notes: Added hierarchical indexing and consolidation -``` - ---- - -## Step 3.4: Validate Phase 3 Completion - -**PROGRESS MARKER**: -``` -[X] PHASE 3 COMPLETED - Learning systems unlock performance potential - Date: 2024-06-02 - Final Test Result: Unit tests pass - Ready for Phase 4: [X] YES / [ ] NO - Notes: Hierarchical episodic memory in place -``` - ---- - -# PHASE 4: COMPETITION OPTIMIZATION 🏆 - -## Step 4.1: Advanced Search Strategies - -**GOAL**: Implement beam search, MCTS, and constraint propagation - -**PROGRESS MARKER**: -``` -[X] Step 4.1 COMPLETED - Advanced search strategies implemented - Date: 2025-09-12 - Test Result: pytest tests/test_beam_search.py passed - Notes: Added beam search with constraint propagation and MCTS search -``` - ---- - -## Step 4.2: Multi-Modal Reasoning - -**GOAL**: Ensemble methods and voting mechanisms - -**PROGRESS MARKER**: -``` -[X] Step 4.2 COMPLETED - Multi-modal reasoning system operational - Date: 2025-09-12 - Test Result: pytest tests/test_episodic_integration.py passed; python tools/train_guidance_on_arc.py --epochs 1 - Notes: Enhanced/baseline ensemble with beam priors; guidance trained on train+eval datasets -``` - ---- - -## Step 4.3: Competition-Specific Optimizations - -**GOAL**: Two-attempt diversity, resource management, deterministic execution - -**PROGRESS MARKER**: -``` -[ ] Step 4.3 COMPLETED - Competition optimizations implemented - Date: 2024-06-03 - Test Result: beam_search op_scores, deterministic two attempts - Notes: Resource limits and diversity enforced -``` - -``` -[X] Step 4.3 UPDATE - Recolor parameter mismatch fixed preventing training failures - Date: 2025-09-12 - Test Result: pytest tests/test_recolor_fix.py passed - Notes: Standardised 'mapping' parameter across heuristics; episodic loader normalises keys - -[X] Step 4.3 UPDATE2 - Translate parameter mismatch fixed preventing training warnings - Date: 2025-09-13 - Test Result: pytest tests/test_translate_fix.py passed; python tools/train_guidance_on_arc.py --epochs 1 - Notes: Canonicalised 'fill' parameter for translate; legacy 'fill_value' still accepted -[X] Step 4.3 UPDATE3 - Translate/recolor params normalised to integers preventing training failures - Date: 2025-09-13 - Test Result: pytest tests/test_translate_fix.py tests/test_recolor_fix.py -q - Notes: Episode loader and DSL cast dy/dx/fill and mapping entries to int - - -[X] Step 4.3 UPDATE4 - Submission script handles memory errors with fallback - Date: 2025-09-13 - Test Result: pytest tests/test_solve_with_budget_memory.py -q - Notes: solve_with_budget catches MemoryError, reports memerror count, runs gc per task - -[X] Step 4.3 UPDATE5 - Public eval runner and Makefile convenience added - Date: 2025-09-13 - Test Result: pytest tests/test_eval_public_script.py -q - Notes: Chunked evaluation with memory guards - -[X] Step 4.3 UPDATE6 - Public eval runner handoff documented +[X] M1: D4 canonicalisation validated Date: 2025-09-14 - Test Result: pytest tests/test_eval_public_script.py -q - Notes: Added HANDOFF.md with runbook - - -``` - ---- - -## Step 4.4: Final Validation - -**PROGRESS MARKER**: -``` -[ ] PHASE 4 COMPLETED - Competition-ready ARC solver with fluid intelligence - Date: 2024-06-03 - Final Test Result: unit tests pass - Competition Ready: [ ] YES / [X] NO - Notes: Further accuracy tuning needed + Test Result: pytest tests/test_canonical.py -q + Notes: Property-based invariance checks for D4 symmetries and colour relabeling ``` --- -# 🚨 CRITICAL INSTRUCTIONS FOR AI AGENT - -## Mandatory Process: -1. **Work on ONE STEP at a time** - Do not skip ahead -2. **Complete each step fully** before moving to the next -3. **Fill in EVERY progress marker** when you complete a step -4. **Test your implementation** after each step -5. **Document any issues** in the Notes section - -## After Each Step: -``` -ALWAYS add your progress marker like this: - -[X] Step X.X COMPLETED - [Brief description of what was implemented] - Date: 2024-MM-DD - Test Result: [Specific test results or accuracy improvement] - Notes: [Any issues, observations, or important details] -``` - -## Before Starting Phase 2, 3, or 4: -- **Verify ALL previous steps are marked complete** -- **Confirm test results meet the phase targets** -- **Do not proceed if previous phase is incomplete** - -## Emergency Procedures: -- If a step fails or causes regressions, **STOP** and fix before proceeding -- If test accuracy decreases, **investigate and resolve** before continuing -- If you encounter issues beyond the scope of these instructions, **document thoroughly** and request guidance - ---- +## Working Protocol +1. Work on one milestone at a time. +2. Validate each checklist item with tests or benchmarks. +3. Update the progress ledger after validation. +4. If regressions occur, halt and resolve before proceeding. -**SUCCESS CRITERIA**: System achieves 80%+ accuracy on ARC evaluation set with clear reasoning traces and adaptive behavior demonstrating fluid intelligence. +**Success Criterion**: 80%+ accuracy on ARC evaluation set with clear reasoning traces and adaptive behaviour. -**START HERE**: Begin with Step 1.1 - Fix DSL Operation Parameter Generation +Start with M0. diff --git a/tests/test_canonical.py b/tests/test_canonical.py new file mode 100644 index 0000000..0a4b0f4 --- /dev/null +++ b/tests/test_canonical.py @@ -0,0 +1,61 @@ +"""Tests for canonicalisation utilities. + +[S:TEST v1] unit=4 property=2 pass +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# Ensure project root on path for direct test execution +sys.path.append(str(Path(__file__).resolve().parents[1])) + +import numpy as np +import pytest +from hypothesis import given, strategies as st +import hypothesis.extra.numpy as hnp + +from arc_solver.canonical import D4, canonicalize_colors, canonicalize_D4 + +Array = np.ndarray + +# Strategy for generating small grids with colours 0-9 +array_shapes = hnp.array_shapes(min_dims=2, max_dims=2, min_side=1, max_side=5) +colour_arrays = hnp.arrays(np.int16, array_shapes, elements=st.integers(min_value=0, max_value=9)) + + +def test_canonicalize_colors_type_checks() -> None: + """canonicalize_colors rejects non-arrays and non-integer dtypes.""" + with pytest.raises(TypeError): + canonicalize_colors([1, 2, 3]) + with pytest.raises(TypeError): + canonicalize_colors(np.array([[0.5]])) + + +@given(colour_arrays) +def test_canonicalize_colors_frequency_order(grid: Array) -> None: + """Colours are relabelled in descending frequency order with contiguous labels.""" + canonical, mapping = canonicalize_colors(grid) + assert sorted(np.unique(canonical).tolist()) == list(range(len(mapping))) + vals, counts = np.unique(grid, return_counts=True) + expected_order = [int(v) for v, _ in sorted(zip(vals, counts), key=lambda t: (-t[1], t[0]))] + assert list(mapping.keys()) == expected_order + + +def test_canonicalize_D4_type_checks() -> None: + """canonicalize_D4 rejects non-arrays and non-integer dtypes.""" + with pytest.raises(TypeError): + canonicalize_D4([1, 2, 3]) + with pytest.raises(TypeError): + canonicalize_D4(np.array([[0.5]])) + + +@given(colour_arrays) +def test_canonicalize_D4_invariance(grid: Array) -> None: + """Canonical form is invariant under D4 transformations and idempotent.""" + canonical = canonicalize_D4(grid) + for transform in D4: + transformed = transform(grid) + assert np.array_equal(canonicalize_D4(transformed), canonical) + assert np.array_equal(canonicalize_D4(canonical), canonical)