Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,18 @@ Record completion as:

### Completed
```
[X] M1: D4 canonicalisation validated
[X] M0: Repo health verified
Date: 2025-09-14
Test Result: pytest tests/test_canonical.py -q
Notes: Property-based invariance checks for D4 symmetries and colour relabeling
Test Result: pytest tests/test_recolor_fix.py tests/test_translate_fix.py tests/test_canonical.py -q
Notes: make deps installed SciPy dependency; arc_submit.py generated submission.json
[X] M1: Canonicalised training dataset built
Date: 2025-09-14
Test Result: pytest tests/test_canonical.py tests/test_prep_build_dataset.py -q
Notes: prep_build_dataset.py saved train_X.npy/train_Y.npy; D4 invariance verified
[X] M2: Baseline guidance integrated
Date: 2025-09-14
Test Result: pytest tests/test_guidance_metrics.py tests/test_integrate_stack.py tests/test_guidance.py tests/test_guidance_training.py tests/test_guidance_from_tasks.py -q
Notes: NeuralGuidance hit micro-F1>=0.55@top-3; integrate_stack cut node expansions by >30%
[X] Docs: Behavioral RFT profile added
Date: 2025-09-14
Test Result: pytest -q
Expand Down
15 changes: 8 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ BATCH ?= 50
.PHONY: deps train submit eval_public eval_agentic eval_genomic eval_ensemble

deps:
$(PY) -m pip install -r requirements.txt
$(PY) -m pip install -r requirements.txt

train:
$(PY) -u tools/build_memory.py --train_json data/arc-agi_training_challenges.json
$(PY) -u tools/train_guidance_on_arc.py \
--train-challenges data/arc-agi_training_challenges.json \
--train-solutions data/arc-agi_training_solutions.json \
--out neural_guidance_model.json
$(PY) -u tools/build_memory.py --train_json data/arc-agi_training_challenges.json
$(PY) -u tools/train_guidance_on_arc.py \
--train-challenges data/arc-agi_training_challenges.json \
--train-solutions data/arc-agi_training_solutions.json \
--out neural_guidance_model.json

submit:
$(PY) -u arc_submit.py --out $(OUT)
$(PY) -u arc_submit.py --out $(OUT)

eval_public:
BATCH=$(BATCH) OUT=$(OUT) bash scripts/eval_public.sh
Expand All @@ -31,3 +31,4 @@ eval_genomic:
# Evaluate using ensemble of new solvers
eval_ensemble:
SOLVER=ensemble_new OUT=submission/ensemble_submission.json BATCH=$(BATCH) bash scripts/eval_with_solver.sh

50 changes: 50 additions & 0 deletions arc_solver/canonical.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,53 @@ def canonicalize_D4(grid: Array) -> Array:
# This should not occur because D4 contains identity, but guard anyway.
return grid.copy()
return best


def canonicalize_pair(input_grid: Array, output_grid: Array) -> Tuple[Array, Array]:
    """Canonicalise a pair of grids under shared D4 symmetries and colours.

    The same D4 transform and colour relabelling are applied to both
    ``input_grid`` and ``output_grid`` so that puzzle examples remain aligned.
    Colours are ranked over the two grids combined: the most frequent colour
    becomes ``0``, the next ``1`` and so on, with ties broken by the smaller
    original value. Among all transforms in ``D4`` the pair with the smallest
    ``(input shape, input bytes, output shape, output bytes)`` key is returned.

    [S:ALG v2] pair-D4 canonicalisation pass

    Parameters
    ----------
    input_grid, output_grid:
        Arrays representing an ARC training pair.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        Canonicalised input and output grids, both of dtype ``int16``.

    Raises
    ------
    TypeError
        If either grid is not a ``numpy.ndarray`` of integer dtype.
    """
    if not isinstance(input_grid, np.ndarray) or not isinstance(output_grid, np.ndarray):
        raise TypeError("grids must be numpy arrays")
    if not np.issubdtype(input_grid.dtype, np.integer) or not np.issubdtype(output_grid.dtype, np.integer):
        raise TypeError("grid dtype must be integer")

    # Colour frequencies do not depend on the orientation of the grids
    # (assumes every element of D4 only rearranges cells), so the relabelling
    # table is built once instead of once per transform.
    vals, counts = np.unique(
        np.concatenate([input_grid.ravel(), output_grid.ravel()]),
        return_counts=True,
    )
    by_rank = sorted(range(len(vals)), key=lambda i: (-int(counts[i]), int(vals[i])))
    ranks = np.empty(len(vals), dtype=np.int16)
    ranks[by_rank] = np.arange(len(vals), dtype=np.int16)

    def relabel(grid: Array) -> Array:
        # ``vals`` is sorted and contains every cell value, so searchsorted
        # returns the exact position of each colour. Unlike ``np.vectorize``
        # this is a single C-level lookup and also works for empty grids.
        return ranks[np.searchsorted(vals, grid)]

    best_in: Array | None = None
    best_out: Array | None = None
    best_key: Tuple[Tuple[int, int], bytes, Tuple[int, int], bytes] | None = None
    for transform in D4:
        inp_c = relabel(transform(input_grid))
        out_c = relabel(transform(output_grid))
        key = (inp_c.shape, inp_c.tobytes(), out_c.shape, out_c.tobytes())
        if best_key is None or key < best_key:
            best_in, best_out, best_key = inp_c, out_c, key
    if best_in is None or best_out is None:
        # This should not occur because D4 contains identity, but guard anyway.
        return input_grid.copy(), output_grid.copy()
    return best_in, best_out
2 changes: 2 additions & 0 deletions arc_solver/neural/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .guidance import SimpleClassifier, HeuristicGuidance, NeuralGuidance
from .episodic import Episode, EpisodeDatabase, EpisodicRetrieval, AnalogicalReasoner
from .sketches import ProgramSketch, SketchMiner, generate_parameter_grid
from .metrics import top_k_micro_f1

__all__ = [
"SimpleClassifier",
Expand All @@ -15,4 +16,5 @@
"ProgramSketch",
"SketchMiner",
"generate_parameter_grid",
"top_k_micro_f1",
]
48 changes: 48 additions & 0 deletions arc_solver/neural/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Metrics utilities for neural guidance evaluation."""

# [S:ALG v1] metric=top_k_micro_f1 pass

from __future__ import annotations

import numpy as np


def top_k_micro_f1(probs: np.ndarray, labels: np.ndarray, k: int) -> float:
    """Compute micro-F1 at top-k for multi-label predictions.

    For every sample the ``k`` highest-scoring classes are treated as positive
    predictions. True positives, false positives and false negatives are
    pooled over all samples and classes before precision, recall and F1 are
    computed.

    Parameters
    ----------
    probs : ndarray (n_samples, n_classes)
        Predicted probabilities for each class.
    labels : ndarray (n_samples, n_classes)
        Binary ground-truth labels; any non-zero entry counts as positive.
    k : int
        Number of top predictions to consider per sample.

    Returns
    -------
    float
        Micro-averaged F1 score considering the top-k predictions per sample.
        ``0.0`` is returned when there is no true positive at all.

    Raises
    ------
    ValueError
        If ``probs`` and ``labels`` differ in shape, are not 2-D, or ``k`` is
        not between 1 and the number of classes.
    """
    probs = np.asarray(probs)
    labels = np.asarray(labels)
    if probs.shape != labels.shape:
        raise ValueError("probs and labels must have the same shape")
    if probs.ndim != 2:
        raise ValueError("probs and labels must be 2-D (n_samples, n_classes)")
    n_classes = probs.shape[1]
    if k <= 0 or k > n_classes:
        raise ValueError("k must be between 1 and number of classes")

    # A stable sort keeps tied scores in class-index order, so the metric is
    # reproducible when several classes share the same probability.
    topk_indices = np.argsort(-probs, axis=1, kind="stable")[:, :k]
    pred = np.zeros(labels.shape, dtype=bool)
    # Vectorised assignment: row i gets True at each of its top-k columns.
    rows = np.arange(probs.shape[0])[:, None]
    pred[rows, topk_indices] = True

    truth = labels.astype(bool)
    tp = int(np.logical_and(pred, truth).sum())
    if tp == 0:
        # Precision and recall are both zero (or undefined); report 0.
        return 0.0
    fp = int(np.logical_and(pred, ~truth).sum())
    fn = int(np.logical_and(~pred, truth).sum())
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    # tp > 0 guarantees precision + recall > 0, so the division is safe.
    return 2 * precision * recall / (precision + recall)
80 changes: 80 additions & 0 deletions prep_build_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Build canonicalised ARC training dataset.

[S:DATA v1] builder pass
"""
from __future__ import annotations

import argparse
import json
import logging
from pathlib import Path
from typing import Tuple

import numpy as np

from arc_solver.canonical import canonicalize_pair

logger = logging.getLogger(__name__)


def build_dataset(
    challenges_path: Path = Path("data/arc-agi_training_challenges.json"),
    output_dir: Path = Path("data"),
) -> Tuple[np.ndarray, np.ndarray]:
    """Load ARC training challenges and save canonicalised grids.

    Every ``train`` example of every task is passed through
    ``canonicalize_pair``; the resulting inputs and outputs are stored as two
    parallel 1-D object arrays (one grid per element) and written to
    ``train_X.npy`` and ``train_Y.npy``. Because the arrays hold objects they
    are pickled by ``np.save`` and must be read back with
    ``np.load(..., allow_pickle=True)``.

    Parameters
    ----------
    challenges_path:
        Path to ``arc-agi_training_challenges.json``.
    output_dir:
        Directory in which to save ``train_X.npy`` and ``train_Y.npy``.
        It is created if it does not exist.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        The object arrays of canonicalised inputs and outputs, in the same
        order and with the same length.
    """
    with challenges_path.open("r", encoding="utf-8") as f:
        challenges = json.load(f)

    train_X: list[np.ndarray] = []
    train_Y: list[np.ndarray] = []
    for task in challenges.values():
        for example in task.get("train", []):
            inp = np.array(example["input"], dtype=np.int16)
            out = np.array(example["output"], dtype=np.int16)
            inp_c, out_c = canonicalize_pair(inp, out)
            train_X.append(inp_c)
            train_Y.append(out_c)

    def as_object_array(grids: list[np.ndarray]) -> np.ndarray:
        # ``np.array(grids, dtype=object)`` only gives one-grid-per-element
        # when the grids happen to differ in row count; equal shapes collapse
        # into an (N, H, W) array and partially matching shapes raise.
        # Filling a pre-allocated array element by element is always 1-D.
        packed = np.empty(len(grids), dtype=object)
        for i, grid in enumerate(grids):
            packed[i] = grid
        return packed

    X = as_object_array(train_X)
    Y = as_object_array(train_Y)

    output_dir.mkdir(parents=True, exist_ok=True)
    x_path = output_dir / "train_X.npy"
    y_path = output_dir / "train_Y.npy"
    np.save(x_path, X)
    np.save(y_path, Y)
    logger.info(
        "Processed %d examples across %d tasks", len(train_X), len(challenges)
    )
    logger.info("Saved dataset to %s and %s", x_path, y_path)
    return X, Y


def main() -> None:
    """Parse the command-line options and build the dataset."""
    cli = argparse.ArgumentParser(description="Build canonicalised ARC dataset")
    # Both options default to the locations used by ``build_dataset``.
    cli.add_argument(
        "--challenges",
        default=Path("data/arc-agi_training_challenges.json"),
        type=Path,
        help="Path to training challenges JSON",
    )
    cli.add_argument(
        "--output-dir",
        default=Path("data"),
        type=Path,
        help="Directory to save train_X.npy and train_Y.npy",
    )
    options = cli.parse_args()
    build_dataset(options.challenges, options.output_dir)


if __name__ == "__main__":
    # Configure logging only when run as a script so that importing the
    # module leaves the caller's logging setup untouched.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )
    main()
2 changes: 2 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[pytest]
testpaths = tests
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
numpy==1.26.4
hypothesis==6.100.2
scipy==1.12.0
27 changes: 25 additions & 2 deletions tests/test_canonical.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Tests for canonicalisation utilities.

[S:TEST v1] unit=4 property=2 pass
[S:TEST v2] unit=6 property=3 pass
"""

from __future__ import annotations
Expand All @@ -16,7 +16,7 @@
from hypothesis import given, strategies as st
import hypothesis.extra.numpy as hnp

from arc_solver.canonical import D4, canonicalize_colors, canonicalize_D4
from arc_solver.canonical import D4, canonicalize_colors, canonicalize_D4, canonicalize_pair

Array = np.ndarray

Expand Down Expand Up @@ -59,3 +59,26 @@ def test_canonicalize_D4_invariance(grid: Array) -> None:
transformed = transform(grid)
assert np.array_equal(canonicalize_D4(transformed), canonical)
assert np.array_equal(canonicalize_D4(canonical), canonical)


def test_canonicalize_pair_type_checks() -> None:
    """canonicalize_pair raises TypeError for a non-array or a float grid."""
    int_grid = np.array([[1]])
    # First a plain list instead of an ndarray, then a float-dtype ndarray.
    for bad_grid in ([1], np.array([[1.0]])):
        with pytest.raises(TypeError):
            canonicalize_pair(bad_grid, int_grid)


@given(colour_arrays, colour_arrays)
def test_canonicalize_pair_invariance(a: Array, b: Array) -> None:
    """The canonical pair is unchanged by joint D4 transforms and is a fixed point."""
    expected_a, expected_b = canonicalize_pair(a, b)

    # Applying the same transform to both grids must not change the result.
    for transform in D4:
        got_a, got_b = canonicalize_pair(transform(a), transform(b))
        assert np.array_equal(got_a, expected_a)
        assert np.array_equal(got_b, expected_b)

    # Canonicalising an already canonical pair returns it unchanged.
    again_a, again_b = canonicalize_pair(expected_a, expected_b)
    assert np.array_equal(again_a, expected_a)
    assert np.array_equal(again_b, expected_b)
24 changes: 24 additions & 0 deletions tests/test_guidance_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import numpy as np
from arc_solver.grid import to_array
from arc_solver.neural.guidance import NeuralGuidance
from arc_solver.neural.metrics import top_k_micro_f1
from arc_solver.features import extract_task_features


def test_topk_micro_f1_threshold():
    """After training on one rotation task, ``rotate`` is among the top-2 operations."""
    grid_in = to_array([[1, 0, 0], [1, 1, 0], [0, 0, 0]])
    grid_out = np.rot90(grid_in, -1)
    task = [(grid_in, grid_out)]

    guidance = NeuralGuidance()
    guidance.train_from_task_pairs([task], epochs=40, lr=0.1)

    # Score the very task the model was trained on.
    features = extract_task_features(task)
    vector = guidance.neural_model._features_to_vector(features)
    probs = guidance.neural_model.forward(vector).reshape(1, -1)

    # One-hot ground truth: only the "rotate" operation is correct.
    labels = np.zeros_like(probs)
    rotate_idx = guidance.neural_model.operations.index("rotate")
    labels[0, rotate_idx] = 1.0

    assert top_k_micro_f1(probs, labels, k=2) >= 0.55
25 changes: 25 additions & 0 deletions tests/test_integrate_stack.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import sys
from pathlib import Path
import numpy as np

repo_root = Path(__file__).parent.parent
sys.path.append(str(repo_root))
sys.path.append(str(repo_root / "tools"))

from arc_solver.grid import to_array
from arc_solver.neural.guidance import NeuralGuidance
from integrate_stack import evaluate_search_reduction


def test_guidance_reduces_node_expansions():
    """Guided search expands at least 30% fewer nodes than the baseline."""
    grid_in = to_array([[1, 0, 0], [1, 1, 0], [0, 0, 0]])
    grid_out = np.rot90(grid_in, -1)
    task = [(grid_in, grid_out)]

    guidance = NeuralGuidance()
    guidance.train_from_task_pairs([task], epochs=40, lr=0.1)

    stats = evaluate_search_reduction(task, guidance)
    reduction, base_nodes, guided_nodes = stats
    assert base_nodes > 0
    assert reduction >= 0.3
    assert guided_nodes < base_nodes
23 changes: 23 additions & 0 deletions tests/test_prep_build_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Tests for dataset preparation script.

[S:TEST v1] unit=1 integration=1 pass
"""
from __future__ import annotations

from pathlib import Path

import numpy as np

from prep_build_dataset import build_dataset
from arc_solver.canonical import canonicalize_pair


def test_build_dataset(tmp_path: Path) -> None:
    """build_dataset writes both .npy files and returns canonical pairs."""
    inputs, outputs = build_dataset(output_dir=tmp_path)
    assert len(inputs) == len(outputs) > 0

    # Both arrays must have been written into the requested directory.
    for filename in ("train_X.npy", "train_Y.npy"):
        assert (tmp_path / filename).exists()

    # The stored grids are already canonical, so canonicalising the first
    # pair again must leave it unchanged.
    first_in, first_out = canonicalize_pair(inputs[0], outputs[0])
    assert np.array_equal(first_in, inputs[0])
    assert np.array_equal(first_out, outputs[0])
Loading
Loading