Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added data/reference_sets/images/2407.00611v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2502.00527v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2502.11018v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2503.14905v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2504.11651v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2505.09371v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2505.11032v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2505.14674v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2505.18110v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2505.19547v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2505.21724v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2506.00070v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2506.01420v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2506.04536v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2506.07001v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2506.23075v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2506.23135v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2509.15193v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2509.21930v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2510.10790v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2510.22981v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2510.24039v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2512.03000v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2512.03127v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data/reference_sets/images/2602.00862v1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
268 changes: 262 additions & 6 deletions data/reference_sets/index.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion prompts/diagram/retriever.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ You must select the **Top {num_examples} candidates** that would be most helpful
Your goal is to find examples that match the Target in both **Domain** and **Diagram Type**.

**1. Match Research Topic (Use Methodology & Caption):**
* What is the domain? (e.g., Agent & Reasoning, Vision & Perception, Generative & Learning, Science & Applications).
* What is the domain? (e.g., Agent & Reasoning, Vision & Perception, Generative & Learning, Science & Applications, Systems & Networking, NLP & Language, Robotics & Control, Multimodal Fusion, Optimization & Theory, Healthcare & Medical).
* Select candidates that belong to the **same research domain**.
* *Why?* Similar domains share similar terminology (e.g., "Actor-Critic" in RL).

Expand Down
85 changes: 85 additions & 0 deletions tests/test_pipeline/test_reference_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from __future__ import annotations

import json
from pathlib import Path

import pytest

from paperbanana.core.types import ReferenceExample
from paperbanana.reference.store import ReferenceStore

# Repository root: this file lives at tests/test_pipeline/, so two parents up.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Directory holding the reference-set index and its images/ subfolder.
INDEX_DIR = REPO_ROOT / "data" / "reference_sets"
INDEX_PATH = INDEX_DIR / "index.json"

# Expected number of entries in index.json; bump when reference examples are added.
EXPECTED_TOTAL = 38
# Every category slug that index.json metadata must declare, and that must have
# at least one example assigned (see TestIndexJson.test_every_category_assigned).
EXPECTED_CATEGORIES = {
    "agent_reasoning",
    "generative_learning",
    "healthcare_medical",
    "multimodal_fusion",
    "nlp_language",
    "optimization_theory",
    "robotics_control",
    "science_applications",
    "systems_networking",
    "vision_perception",
}


@pytest.fixture()
def index_data() -> dict:
    """Return the parsed contents of data/reference_sets/index.json."""
    return json.loads(INDEX_PATH.read_text(encoding="utf-8"))


class TestIndexJson:
    """Integrity checks for the reference-set index and its backing files."""

    def test_index_file_exists(self):
        # The index must be committed alongside the images it describes.
        assert INDEX_PATH.exists(), f"{INDEX_PATH} is missing"

    def test_total_examples_matches(self, index_data: dict):
        # The example list, the declared metadata count, and the test's
        # expected constant must all agree.
        entries = index_data["examples"]
        assert len(entries) == index_data["metadata"]["total_examples"]
        assert len(entries) == EXPECTED_TOTAL

    def test_categories_complete(self, index_data: dict):
        # Metadata must declare exactly the known category slugs.
        assert set(index_data["metadata"]["categories"]) == EXPECTED_CATEGORIES

    def test_no_duplicate_ids(self, index_data: dict):
        ids = [entry["id"] for entry in index_data["examples"]]
        # Set size shrinks iff at least one id repeats.
        assert len(set(ids)) == len(ids), f"Duplicate IDs: {[x for x in ids if ids.count(x) > 1]}"

    def test_every_entry_has_source_paper(self, index_data: dict):
        # Each example must carry a non-empty source_paper reference.
        for e in index_data["examples"]:
            assert "source_paper" in e and e["source_paper"], (
                f"Entry {e['id']} missing source_paper"
            )

    def test_every_category_assigned(self, index_data: dict):
        # No declared category may be left without at least one example.
        covered = {entry["category"] for entry in index_data["examples"] if entry.get("category")}
        missing = EXPECTED_CATEGORIES - covered
        assert not missing, f"Categories with zero entries: {missing}"

    def test_image_files_exist(self, index_data: dict):
        # Collect ids whose referenced image file is absent on disk.
        missing = [
            entry["id"]
            for entry in index_data["examples"]
            if not (INDEX_DIR / entry["image_path"]).exists()
        ]
        assert not missing, f"Missing images for: {missing}"

    def test_entries_parse_as_reference_example(self, index_data: dict):
        # Every raw dict must round-trip through the ReferenceExample model.
        for entry in index_data["examples"]:
            ReferenceExample(**entry)

    def test_reference_store_loads_all(self):
        # The store built from the index directory sees every example.
        assert ReferenceStore(str(INDEX_DIR)).count == EXPECTED_TOTAL

    def test_reference_store_category_counts(self):
        # Category lookup through the store must be non-empty for each slug.
        store = ReferenceStore(str(INDEX_DIR))
        for cat in EXPECTED_CATEGORIES:
            assert len(store.get_by_category(cat)) > 0, f"No entries for category: {cat}"
Loading