diff --git a/AGENTS.md b/AGENTS.md index cb741fb..efe84ce 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -72,10 +72,18 @@ Record completion as: ### Completed ``` -[X] M1: D4 canonicalisation validated +[X] M0: Repo health verified Date: 2025-09-14 - Test Result: pytest tests/test_canonical.py -q - Notes: Property-based invariance checks for D4 symmetries and colour relabeling + Test Result: pytest tests/test_recolor_fix.py tests/test_translate_fix.py tests/test_canonical.py -q + Notes: make deps installed SciPy dependency; arc_submit.py generated submission.json +[X] M1: Canonicalised training dataset built + Date: 2025-09-14 + Test Result: pytest tests/test_canonical.py tests/test_prep_build_dataset.py -q + Notes: prep_build_dataset.py saved train_X.npy/train_Y.npy; D4 invariance verified +[X] M2: Baseline guidance integrated + Date: 2025-09-14 + Test Result: pytest tests/test_guidance_metrics.py tests/test_integrate_stack.py tests/test_guidance.py tests/test_guidance_training.py tests/test_guidance_from_tasks.py -q + Notes: NeuralGuidance hit micro-F1>=0.55@top-3; integrate_stack cut node expansions by >30% [X] Docs: Behavioral RFT profile added Date: 2025-09-14 Test Result: pytest -q diff --git a/Makefile b/Makefile index 70be744..d0d9e81 100644 --- a/Makefile +++ b/Makefile @@ -5,17 +5,17 @@ BATCH ?= 50 .PHONY: deps train submit eval_public eval_agentic eval_genomic eval_ensemble deps: -$(PY) -m pip install -r requirements.txt + $(PY) -m pip install -r requirements.txt train: -$(PY) -u tools/build_memory.py --train_json data/arc-agi_training_challenges.json -$(PY) -u tools/train_guidance_on_arc.py \ ---train-challenges data/arc-agi_training_challenges.json \ ---train-solutions data/arc-agi_training_solutions.json \ ---out neural_guidance_model.json + $(PY) -u tools/build_memory.py --train_json data/arc-agi_training_challenges.json + $(PY) -u tools/train_guidance_on_arc.py \ + --train-challenges data/arc-agi_training_challenges.json \ + --train-solutions data/arc-agi_training_solutions.json \ + --out neural_guidance_model.json submit: -$(PY) -u arc_submit.py --out $(OUT) + $(PY) -u arc_submit.py --out $(OUT) eval_public: BATCH=$(BATCH) OUT=$(OUT) bash scripts/eval_public.sh @@ -31,3 +31,4 @@ eval_genomic: # Evaluate using ensemble of new solvers eval_ensemble: SOLVER=ensemble_new OUT=submission/ensemble_submission.json BATCH=$(BATCH) bash scripts/eval_with_solver.sh + diff --git a/arc_solver/canonical.py b/arc_solver/canonical.py index 3539ac7..eaf57dd 100644 --- a/arc_solver/canonical.py +++ b/arc_solver/canonical.py @@ -97,3 +97,53 @@ def canonicalize_D4(grid: Array) -> Array: # This should not occur because D4 contains identity, but guard anyway. return grid.copy() return best + + +def canonicalize_pair(input_grid: Array, output_grid: Array) -> Tuple[Array, Array]: + """Canonicalise a pair of grids under shared D4 symmetries and colours. + + The same D4 transform and colour relabelling are applied to both ``input_grid`` + and ``output_grid`` so that puzzle examples remain aligned. + + [S:ALG v2] pair-D4 canonicalisation pass + + Parameters + ---------- + input_grid, output_grid: + Arrays representing an ARC training pair. + + Returns + ------- + Tuple[np.ndarray, np.ndarray] + Canonicalised input and output grids. + + Raises + ------ + TypeError + If either grid is not a ``numpy.ndarray`` of integer dtype. + """ + + if not isinstance(input_grid, np.ndarray) or not isinstance(output_grid, np.ndarray): + raise TypeError("grids must be numpy arrays") + if not np.issubdtype(input_grid.dtype, np.integer) or not np.issubdtype(output_grid.dtype, np.integer): + raise TypeError("grid dtype must be integer") + + best_in: Array | None = None + best_out: Array | None = None + best_key: Tuple[Tuple[int, int], bytes, Tuple[int, int], bytes] | None = None + for transform in D4: + inp_t = transform(input_grid) + out_t = transform(output_grid) + vals, counts = np.unique(np.concatenate([inp_t.ravel(), out_t.ravel()]), return_counts=True) + order = [int(v) for v, _ in sorted(zip(vals, counts), key=lambda t: (-t[1], t[0]))] + mapping = {c: i for i, c in enumerate(order)} + vect_map = np.vectorize(mapping.get) + inp_c = vect_map(inp_t).astype(np.int16) + out_c = vect_map(out_t).astype(np.int16) + key = (inp_c.shape, inp_c.tobytes(), out_c.shape, out_c.tobytes()) + if best_key is None or key < best_key: + best_in, best_out, best_key = inp_c, out_c, key + if best_in is None or best_out is None: + # This should not occur because D4 contains identity, but guard anyway. + return input_grid.copy(), output_grid.copy() + return best_in, best_out diff --git a/arc_solver/neural/__init__.py b/arc_solver/neural/__init__.py index 045e3f3..a8d8e87 100644 --- a/arc_solver/neural/__init__.py +++ b/arc_solver/neural/__init__.py @@ -3,6 +3,7 @@ from .guidance import SimpleClassifier, HeuristicGuidance, NeuralGuidance from .episodic import Episode, EpisodeDatabase, EpisodicRetrieval, AnalogicalReasoner from .sketches import ProgramSketch, SketchMiner, generate_parameter_grid +from .metrics import top_k_micro_f1 __all__ = [ "SimpleClassifier", @@ -15,4 +16,5 @@ "ProgramSketch", "SketchMiner", "generate_parameter_grid", + "top_k_micro_f1", ] diff --git a/arc_solver/neural/metrics.py b/arc_solver/neural/metrics.py new file mode 100644 index 0000000..b157e50 --- /dev/null +++ b/arc_solver/neural/metrics.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +"""Metrics utilities for neural guidance evaluation.""" + +# [S:ALG v1] metric=top_k_micro_f1 pass + +import numpy as np + + +def top_k_micro_f1(probs: np.ndarray, labels: np.ndarray, k: int) -> float: + """Compute micro-F1 at top-k for multi-label predictions. + + Parameters + ---------- + probs : ndarray (n_samples, n_classes) + Predicted probabilities for each class. + labels : ndarray (n_samples, n_classes) + Binary ground-truth labels. + k : int + Number of top predictions to consider per sample. + + Returns + ------- + float + Micro-averaged F1 score considering the top-k predictions per sample. + """ + if probs.shape != labels.shape: + raise ValueError("probs and labels must have the same shape") + n_classes = probs.shape[1] + if k <= 0 or k > n_classes: + raise ValueError("k must be between 1 and number of classes") + + topk_indices = np.argsort(-probs, axis=1)[:, :k] + pred = np.zeros_like(labels, dtype=bool) + for i, idxs in enumerate(topk_indices): + pred[i, idxs] = True + + labels = labels.astype(bool) + tp = np.logical_and(pred, labels).sum() + fp = np.logical_and(pred, ~labels).sum() + fn = np.logical_and(~pred, labels).sum() + if tp == 0: + return 0.0 + precision = tp / (tp + fp) + recall = tp / (tp + fn) + if precision + recall == 0: + return 0.0 + return 2 * precision * recall / (precision + recall) diff --git a/prep_build_dataset.py b/prep_build_dataset.py new file mode 100644 index 0000000..59510ca --- /dev/null +++ b/prep_build_dataset.py @@ -0,0 +1,80 @@ +"""Build canonicalised ARC training dataset. + +[S:DATA v1] builder pass +""" +from __future__ import annotations + +import argparse +import json +import logging +from pathlib import Path +from typing import Tuple + +import numpy as np + +from arc_solver.canonical import canonicalize_pair + +logger = logging.getLogger(__name__) + + +def build_dataset( + challenges_path: Path = Path("data/arc-agi_training_challenges.json"), + output_dir: Path = Path("data"), +) -> Tuple[np.ndarray, np.ndarray]: + """Load ARC training challenges and save canonicalised grids. + + Parameters + ---------- + challenges_path: + Path to ``arc-agi_training_challenges.json``. + output_dir: + Directory in which to save ``train_X.npy`` and ``train_Y.npy``. + """ + with challenges_path.open("r", encoding="utf-8") as f: + challenges = json.load(f) + + train_X: list[np.ndarray] = [] + train_Y: list[np.ndarray] = [] + for task_id, task in challenges.items(): + for example in task.get("train", []): + inp = np.array(example["input"], dtype=np.int16) + out = np.array(example["output"], dtype=np.int16) + inp_c, out_c = canonicalize_pair(inp, out) + train_X.append(inp_c) + train_Y.append(out_c) + assert len(train_X) == len(train_Y) + + output_dir.mkdir(parents=True, exist_ok=True) + np.save(output_dir / "train_X.npy", np.array(train_X, dtype=object)) + np.save(output_dir / "train_Y.npy", np.array(train_Y, dtype=object)) + logger.info( + "Processed %d examples across %d tasks", len(train_X), len(challenges) + ) + logger.info( + "Saved dataset to %s and %s", output_dir / "train_X.npy", output_dir / "train_Y.npy" + ) + return np.array(train_X, dtype=object), np.array(train_Y, dtype=object) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Build canonicalised ARC dataset") + parser.add_argument( + "--challenges", + type=Path, + default=Path("data/arc-agi_training_challenges.json"), + help="Path to training challenges JSON", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("data"), + help="Directory to save train_X.npy and train_Y.npy", + ) + args = parser.parse_args() + + build_dataset(args.challenges, args.output_dir) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") + main() diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..5ee6477 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +testpaths = tests diff --git a/requirements.txt b/requirements.txt index 1d2960f..cf8caa5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ numpy==1.26.4 hypothesis==6.100.2 +scipy==1.12.0 diff --git a/tests/test_canonical.py b/tests/test_canonical.py index 0a4b0f4..8250b5d 100644 --- a/tests/test_canonical.py +++ b/tests/test_canonical.py @@ -1,6 +1,6 @@ """Tests for canonicalisation utilities. -[S:TEST v1] unit=4 property=2 pass +[S:TEST v2] unit=6 property=3 pass """ from __future__ import annotations @@ -16,7 +16,7 @@ from hypothesis import given, strategies as st import hypothesis.extra.numpy as hnp -from arc_solver.canonical import D4, canonicalize_colors, canonicalize_D4 +from arc_solver.canonical import D4, canonicalize_colors, canonicalize_D4, canonicalize_pair Array = np.ndarray @@ -59,3 +59,26 @@ def test_canonicalize_D4_invariance(grid: Array) -> None: transformed = transform(grid) assert np.array_equal(canonicalize_D4(transformed), canonical) assert np.array_equal(canonicalize_D4(canonical), canonical) + + +def test_canonicalize_pair_type_checks() -> None: + """canonicalize_pair rejects non-arrays and non-integer dtypes.""" + with pytest.raises(TypeError): + canonicalize_pair([1], np.array([[1]])) + with pytest.raises(TypeError): + canonicalize_pair(np.array([[1.0]]), np.array([[1]])) + + +@given(colour_arrays, colour_arrays) +def test_canonicalize_pair_invariance(a: Array, b: Array) -> None: + """Canonical pair invariant under joint D4 transforms and idempotent.""" + can_a, can_b = canonicalize_pair(a, b) + for transform in D4: + ta = transform(a) + tb = transform(b) + cta, ctb = canonicalize_pair(ta, tb) + assert np.array_equal(cta, can_a) + assert np.array_equal(ctb, can_b) + ca2, cb2 = canonicalize_pair(can_a, can_b) + assert np.array_equal(ca2, can_a) + assert np.array_equal(cb2, can_b) diff --git a/tests/test_guidance_metrics.py b/tests/test_guidance_metrics.py new file mode 100644 index 0000000..90d8da7 --- /dev/null +++ b/tests/test_guidance_metrics.py @@ -0,0 +1,24 @@ +import numpy as np +from arc_solver.grid import to_array +from arc_solver.neural.guidance import NeuralGuidance +from arc_solver.neural.metrics import top_k_micro_f1 +from arc_solver.features import extract_task_features + + +def test_topk_micro_f1_threshold(): + inp = to_array([[1, 0, 0], [1, 1, 0], [0, 0, 0]]) + out = np.rot90(inp, -1) + task = [(inp, out)] + + guidance = NeuralGuidance() + guidance.train_from_task_pairs([task], epochs=40, lr=0.1) + + feat = extract_task_features(task) + X = guidance.neural_model._features_to_vector(feat) + probs = guidance.neural_model.forward(X).reshape(1, -1) + labels = np.zeros_like(probs) + idx = guidance.neural_model.operations.index("rotate") + labels[0, idx] = 1.0 + + f1 = top_k_micro_f1(probs, labels, k=2) + assert f1 >= 0.55 diff --git a/tests/test_integrate_stack.py b/tests/test_integrate_stack.py new file mode 100644 index 0000000..316c9c7 --- /dev/null +++ b/tests/test_integrate_stack.py @@ -0,0 +1,25 @@ +import sys +from pathlib import Path +import numpy as np + +repo_root = Path(__file__).parent.parent +sys.path.append(str(repo_root)) +sys.path.append(str(repo_root / "tools")) + +from arc_solver.grid import to_array +from arc_solver.neural.guidance import NeuralGuidance +from integrate_stack import evaluate_search_reduction + + +def test_guidance_reduces_node_expansions(): + inp = to_array([[1, 0, 0], [1, 1, 0], [0, 0, 0]]) + out = np.rot90(inp, -1) + task = [(inp, out)] + + guidance = NeuralGuidance() + guidance.train_from_task_pairs([task], epochs=40, lr=0.1) + + reduction, base_nodes, guided_nodes = evaluate_search_reduction(task, guidance) + assert base_nodes > 0 + assert reduction >= 0.3 + assert guided_nodes < base_nodes diff --git a/tests/test_prep_build_dataset.py b/tests/test_prep_build_dataset.py new file mode 100644 index 0000000..bc5ccdf --- /dev/null +++ b/tests/test_prep_build_dataset.py @@ -0,0 +1,23 @@ +"""Tests for dataset preparation script. + +[S:TEST v1] unit=1 integration=1 pass +""" +from __future__ import annotations + +from pathlib import Path + +import numpy as np + +from prep_build_dataset import build_dataset +from arc_solver.canonical import canonicalize_pair + + +def test_build_dataset(tmp_path: Path) -> None: + X, Y = build_dataset(output_dir=tmp_path) + assert len(X) == len(Y) > 0 + x_path = tmp_path / "train_X.npy" + y_path = tmp_path / "train_Y.npy" + assert x_path.exists() and y_path.exists() + cx, cy = canonicalize_pair(X[0], Y[0]) + assert np.array_equal(cx, X[0]) + assert np.array_equal(cy, Y[0]) diff --git a/tools/integrate_stack.py b/tools/integrate_stack.py new file mode 100644 index 0000000..b7b5b91 --- /dev/null +++ b/tools/integrate_stack.py @@ -0,0 +1,108 @@ +"""Integrate neural guidance with beam search and report node reduction.""" +# [S:INTEGRATION v1] beam_search+guidance pass + +from __future__ import annotations + +import argparse +import json +import logging +from pathlib import Path +from typing import List, Tuple +import sys + +sys.path.append(str(Path(__file__).parent.parent)) + +import numpy as np + +from arc_solver.grid import Array +from arc_solver.neural.guidance import NeuralGuidance +from arc_solver.dsl import OPS +from arc_solver.heuristics import score_candidate +from arc_solver.neural.sketches import generate_parameter_grid + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def load_task(challenges_path: str, task_id: str) -> List[Tuple[Array, Array]]: + """Load a single task's training pairs.""" + with open(challenges_path, "r", encoding="utf-8") as f: + data = json.load(f) + task = data[task_id] + pairs: List[Tuple[Array, Array]] = [] + for pair in task["train"]: + inp = np.array(pair["input"], dtype=int) + out = np.array(pair["output"], dtype=int) + pairs.append((inp, out)) + return pairs + + +def evaluate_search_reduction( + train_pairs: List[Tuple[Array, Array]], guidance: NeuralGuidance +) -> tuple[float, int, int]: + """Compare node expansions with and without guidance.""" + + def _count_expansions(order: List[str]) -> int: + expansions = 0 + for op in order: + for params in generate_parameter_grid(op): + expansions += 1 + program = [(op, params)] + try: + if score_candidate(program, train_pairs) >= 0.999: + return expansions + except Exception: + continue + return expansions + + baseline_order = list(OPS.keys())[2:] + list(OPS.keys())[:2] + base_nodes = _count_expansions(baseline_order) + + op_scores = guidance.score_operations(train_pairs) + guided_order = sorted(op_scores, key=op_scores.get, reverse=True) + guided_nodes = _count_expansions(guided_order) + + reduction = 1.0 - guided_nodes / max(1, base_nodes) + logger.info( + "integrate_stack", extra={"baseline": base_nodes, "guided": guided_nodes, "reduction": reduction} + ) + return reduction, base_nodes, guided_nodes + + +def main() -> None: + parser = argparse.ArgumentParser(description="Evaluate neural guidance integration") + parser.add_argument("--challenges", default="data/arc-agi_training_challenges.json") + parser.add_argument("--task-id", default="007bbfb7", help="Task ID to evaluate") + parser.add_argument("--model", default="models/guidance_arc.json") + parser.add_argument("--epochs", type=int, default=30) + args = parser.parse_args() + + pairs = load_task(args.challenges, args.task_id) + guidance = NeuralGuidance() + + model_path = Path(args.model) + if model_path.exists(): + guidance.load_model(str(model_path)) + else: + with open(args.challenges, "r", encoding="utf-8") as f: + challenges = json.load(f) + all_tasks: List[List[Tuple[Array, Array]]] = [] + for task in challenges.values(): + t_pairs: List[Tuple[Array, Array]] = [] + for pair in task["train"]: + inp = np.array(pair["input"], dtype=int) + out = np.array(pair["output"], dtype=int) + t_pairs.append((inp, out)) + all_tasks.append(t_pairs) + guidance.train_from_task_pairs(all_tasks, epochs=args.epochs) + model_path.parent.mkdir(parents=True, exist_ok=True) + guidance.save_model(str(model_path)) + + reduction, base_nodes, guided_nodes = evaluate_search_reduction(pairs, guidance) + print( + f"baseline_nodes={base_nodes} guided_nodes={guided_nodes} reduction={reduction*100:.1f}%" + ) + + +if __name__ == "__main__": + main()