Commit b0b96d0
Feat/ref hallucination arena (#120)

- **docs**: add Reference Hallucination Arena benchmark documentation and examples. Add comprehensive documentation for the Reference Hallucination Arena benchmark, which evaluates LLM reference recommendation accuracy by verifying citations against Crossref, PubMed, arXiv, and DBLP. The dataset is hosted on HuggingFace (OpenJudge/ref-hallucination-arena).
  - Add docs/validating_graders/ref_hallucination_arena.md with a full guide
  - Add cookbooks/ref_hallucination_arena/examples/ with config templates (config.yaml, minimal_config.yaml, queries_example.json)
  - Update mkdocs.yml to include the new doc in navigation
  - Update validating_graders/overview.md with cross-references
- **feat**: add Reference Hallucination Arena cookbook implementation. Add the complete ref_hallucination_arena cookbook with pipeline, verifiers, collectors, scoring, and reporting modules for evaluating LLM reference recommendation accuracy against Crossref, PubMed, arXiv, and DBLP.
- **refactor**: simplify the ref_hallucination_arena cookbook and update docs. Streamline the collectors, verifiers, scoring, reporting, and pipeline modules by removing redundant code; update documentation accordingly.
- **fix**: address code review issues in ref_hallucination_arena
  - Fix the buggy BibTeX field regex in bib_extractor.py: split it into separate patterns for brace-delimited, quote-delimited, and unquoted numeric values
  - Fix dead fallback logic in pipeline.py: use `not` instead of `is None`, since responses are initialized as empty dicts, not None
  - Eliminate duplicate config loading in __main__.py: load the config once in main() and pass the object directly to _run_evaluation()
  - Remove the unused variable `completed_queries` in response_collector.py
  - Simplify match_detail access in objective_scorer.py: use direct attribute access instead of redundant dict/getattr branching
- **fix**: resolve pre-commit hook failures
  - Import RefArenaConfig in __main__.py to fix the flake8 F821 undefined-name error
  - Reformat bib_extractor.py to satisfy black code style
- **style**: reformat bib_extractor.py with black 25.9.0 (line-length=120)
- **docs**: add Reference Hallucination Arena news to README. Add a news entry for the Reference Hallucination Arena benchmark in both the English and Chinese READMEs, and remove the outdated v0.2.0 release note from the English README.

Co-authored-by: Cursor <cursoragent@cursor.com>
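The BibTeX field-regex fix above splits one pattern into three, tried in order. A minimal, self-contained sketch of that approach (`extract_field` here is a simplified stand-in for the inner helper of the same name in bib_extractor.py; the sample `entry` string is invented for illustration):

```python
import re


def extract_field(name: str, fields_str: str):
    """Try brace-delimited, quote-delimited, then bare numeric values, in order."""
    # field = {value} -- braces can wrap values containing commas
    m = re.search(rf"{name}\s*=\s*\{{(.*?)\}}", fields_str, re.IGNORECASE | re.DOTALL)
    if m:
        return m.group(1).strip()
    # field = "value"
    m = re.search(rf'{name}\s*=\s*"(.*?)"', fields_str, re.IGNORECASE | re.DOTALL)
    if m:
        return m.group(1).strip()
    # field = 2023 (unquoted numeric, common for year)
    m = re.search(rf"{name}\s*=\s*(\d+)", fields_str, re.IGNORECASE)
    if m:
        return m.group(1).strip()
    return None


entry = 'title = {Attention Is All You Need}, journal = "NeurIPS", year = 2017'
print(extract_field("title", entry))    # Attention Is All You Need
print(extract_field("journal", entry))  # NeurIPS
print(extract_field("year", entry))     # 2017
```

Folding all three delimiter styles into a single alternation is what made the original regex buggy; three small patterns are easier to verify independently.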
1 parent 267b047 · commit b0b96d0 · 29 files changed: +4465 −3 lines

README.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -47,14 +47,14 @@ It can also convert grading results into **reward signals** to help you **fine-t
 
 ## News
 
+- **2026-02-12** - 📚 **Reference Hallucination Arena** - Benchmark for evaluating LLM academic reference hallucination. 👉 [Documentation](./docs/validating_graders/ref_hallucination_arena.md)
+
 - **2026-01-27** - 🆕 **Paper Review** - Automatically review academic papers using LLM-powered evaluation. 👉 [Documentation](https://agentscope-ai.github.io/OpenJudge/applications/paper_review/)
 
 - **2026-01-27** - 🖥️ **OpenJudge UI** - A Streamlit-based visual interface for grader testing and Auto Arena. Run `streamlit run ui/app.py` to get started.
 
 - **2026-01-05** - 🏟️ **Auto Arena** - Automatically evaluate and compare multiple models without pre-existing test data. 👉 [Documentation](https://agentscope-ai.github.io/OpenJudge/applications/auto_arena/)
 
-- **2025-12-26** - Released OpenJudge v0.2.0 on [PyPI](https://pypi.org/project/py-openjudge/)[migration-guide](#migration-guide-v01x--v020)
-
 ---
 
 ## ✨ Key Features
```

README_zh.md

Lines changed: 2 additions & 0 deletions (content translated from Chinese)

```diff
@@ -112,6 +112,8 @@ OpenJudge provides **ready-to-use graders** and supports generating **scenario-specific evaluation
 ---
 ## News
 
+- **2026-02-12** - 📚 **Reference Hallucination Arena** - A benchmark for evaluating academic reference hallucination in large language models. 👉 [Documentation](./docs/validating_graders/ref_hallucination_arena.md)
+
 - **2025-12-26** - Released OpenJudge v0.2.0 on [PyPI](https://pypi.org/project/py-openjudge/) - **Major update!** This release extends our core capabilities by adding robust support for diverse evaluation scenarios on top of reward construction. By unifying reward and evaluation signals, OpenJudge v0.2.0 offers a more comprehensive approach to optimizing application performance. → [Migration guide](#迁移指南v01x--v020)
 
 - **2025-10-20** - [Auto-Rubric: Learning to Extract Generalizable Criteria for Reward Modeling](https://arxiv.org/abs/2510.17314) - We released a new paper on learning generalizable criteria for robust reward modeling.
```
cookbooks/ref_hallucination_arena/__main__.py

Lines changed: 88 additions & 0 deletions (new file)

```python
# -*- coding: utf-8 -*-
"""CLI entry point for Reference Hallucination Arena.

Usage:
    python -m cookbooks.ref_hallucination_arena --config config.yaml
    python -m cookbooks.ref_hallucination_arena --config config.yaml --save
    python -m cookbooks.ref_hallucination_arena --config config.yaml --fresh
"""

import asyncio
from pathlib import Path
from typing import Optional

import fire
from loguru import logger

from cookbooks.ref_hallucination_arena.pipeline import RefArenaPipeline
from cookbooks.ref_hallucination_arena.schema import RefArenaConfig, load_config


async def _run_evaluation(
    config: RefArenaConfig,
    save: bool = False,
    resume: bool = True,
) -> None:
    """Run the evaluation pipeline."""
    pipeline = RefArenaPipeline(config=config, resume=resume)
    result = await pipeline.evaluate()

    if save:
        pipeline.save_results(result)


def main(
    config: str,
    output_dir: Optional[str] = None,
    save: bool = False,
    fresh: bool = False,
) -> None:
    """Reference Hallucination Arena CLI.

    Evaluate LLM reference recommendation capabilities by verifying
    recommended papers against Crossref, PubMed, arXiv, and DBLP.

    Args:
        config: Path to YAML configuration file.
        output_dir: Output directory for results (overrides config).
        save: Whether to save results to file.
        fresh: Start fresh, ignoring any existing checkpoint.

    Examples:
        # Normal run (auto-resumes from checkpoint)
        python -m cookbooks.ref_hallucination_arena --config config.yaml --save

        # Start fresh
        python -m cookbooks.ref_hallucination_arena --config config.yaml --fresh --save
    """
    config_path = Path(config)
    if not config_path.exists():
        logger.error(f"Config file not found: {config}")
        return

    # Load config once and apply output_dir override
    loaded_config = load_config(str(config_path))
    if output_dir:
        loaded_config.output.output_dir = output_dir

    if fresh:
        logger.info("Starting fresh (ignoring checkpoint)")
        from cookbooks.ref_hallucination_arena.pipeline import CheckpointManager

        CheckpointManager(loaded_config.output.output_dir).clear()
    else:
        logger.info("Resume mode enabled")

    logger.info(f"Starting Reference Hallucination Arena with config: {config}")

    asyncio.run(
        _run_evaluation(
            loaded_config,
            save,
            resume=not fresh,
        )
    )


if __name__ == "__main__":
    fire.Fire(main)
```
cookbooks/ref_hallucination_arena/collectors/__init__.py

Lines changed: 9 additions & 0 deletions (new file)

```python
# -*- coding: utf-8 -*-
"""Data collectors for Reference Hallucination Arena."""

from cookbooks.ref_hallucination_arena.collectors.bib_extractor import BibExtractor
from cookbooks.ref_hallucination_arena.collectors.response_collector import (
    ResponseCollector,
)

__all__ = ["BibExtractor", "ResponseCollector"]
```
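The `BibExtractor` re-exported above tries fenced code blocks as its first extraction strategy. A minimal standalone sketch of that strategy, assuming a fence regex equivalent to the `_FENCE_PATTERN` in bib_extractor.py (the sample `response` text is invented for illustration):

```python
import re

# Equivalent to _FENCE_PATTERN: grab the body of bib/bibtex code fences
# (`{3} matches three literal backticks)
FENCE = re.compile(r"`{3}(?:bib(?:tex)?)\s*\n(.*?)`{3}", re.DOTALL | re.IGNORECASE)

tick = "`" * 3  # three backticks, built here to avoid a nested fence in this example
response = (
    "Here is a paper you may like:\n"
    + tick + "bibtex\n"
    + "@article{vaswani2017, title = {Attention Is All You Need}}\n"
    + tick + "\nHope this helps!"
)

blocks = FENCE.findall(response)
print(blocks[0].strip())  # @article{vaswani2017, title = {Attention Is All You Need}}
```

Fenced blocks are the highest-precision signal, which is why the extractor only falls back to scanning raw text when no fenced BibTeX parses successfully.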
cookbooks/ref_hallucination_arena/collectors/bib_extractor.py

Lines changed: 206 additions & 0 deletions (new file)

```python
# -*- coding: utf-8 -*-
"""Extract BibTeX references from free-text model responses."""

import re
from typing import List, Optional

from cookbooks.ref_hallucination_arena.schema import Reference


class BibExtractor:
    """Extract BibTeX entries from model responses.

    Strategies (tried in order):
    1. Extract content inside ```bib / ```bibtex code fences.
    2. Extract standalone @type{...} entries scattered in the text.
    3. Fallback: try to parse structured plain-text references.
    """

    # Matches ```bib or ```bibtex fenced code blocks
    _FENCE_PATTERN = re.compile(
        r"```(?:bib(?:tex)?)\s*\n(.*?)```",
        re.DOTALL | re.IGNORECASE,
    )

    # Matches the start of a BibTeX entry: @type{key,
    # The entry body is then delimited by brace-counting to handle nested braces
    _ENTRY_START_PATTERN = re.compile(
        r"@(\w+)\s*\{\s*([^,\s]*)\s*,",
        re.IGNORECASE,
    )

    def extract(self, response_text: str) -> List[Reference]:
        """Extract references from a model response.

        Args:
            response_text: Raw text response from the model.

        Returns:
            List of extracted Reference objects.
        """
        if not response_text:
            return []

        # Strategy 1: fenced code blocks
        fenced_content = self._extract_fenced(response_text)
        if fenced_content:
            refs = self._parse_bibtex(fenced_content)
            if refs:
                return refs

        # Strategy 2: standalone entries in text
        refs = self._parse_bibtex(response_text)
        if refs:
            return refs

        # Strategy 3: plain-text fallback (numbered references)
        return self._parse_plain_text(response_text)

    def _extract_fenced(self, text: str) -> str:
        """Extract content from ```bib/bibtex fenced blocks."""
        blocks = self._FENCE_PATTERN.findall(text)
        if blocks:
            return "\n\n".join(blocks)
        return ""

    def _parse_bibtex(self, text: str) -> List[Reference]:
        """Parse BibTeX entries using brace-counting for robustness."""
        refs = []

        for match in self._ENTRY_START_PATTERN.finditer(text):
            entry_type = match.group(1).lower()
            key = match.group(2).strip()

            # Find the matching closing brace via counting
            start = match.start()
            brace_start = text.index("{", start)
            fields_str = self._extract_braced_content(text, brace_start)
            if fields_str is None:
                continue

            ref = self._parse_fields(key, entry_type, fields_str)
            if ref:
                refs.append(ref)

        return refs

    def _extract_braced_content(self, text: str, open_pos: int) -> Optional[str]:
        """Extract content between matched braces starting at open_pos."""
        depth = 0
        for i in range(open_pos, len(text)):
            if text[i] == "{":
                depth += 1
            elif text[i] == "}":
                depth -= 1
                if depth == 0:
                    return text[open_pos + 1 : i]
        return None  # unmatched

    def _parse_fields(self, key: str, entry_type: str, fields_str: str) -> Optional[Reference]:
        """Parse individual fields from BibTeX entry body."""

        def extract_field(name: str) -> Optional[str]:
            # Match field = {value}, field = "value", or field = number
            # Try brace-delimited value first (non-greedy: stops at the first closing brace)
            brace_pattern = rf"{name}\s*=\s*\{{(.*?)\}}"
            m = re.search(brace_pattern, fields_str, re.IGNORECASE | re.DOTALL)
            if m:
                return m.group(1).strip()
            # Try quote-delimited value
            quote_pattern = rf'{name}\s*=\s*"(.*?)"'
            m = re.search(quote_pattern, fields_str, re.IGNORECASE | re.DOTALL)
            if m:
                return m.group(1).strip()
            # Try unquoted numeric value (e.g., year = 2023)
            num_pattern = rf"{name}\s*=\s*(\d+)"
            m = re.search(num_pattern, fields_str, re.IGNORECASE)
            if m:
                return m.group(1).strip()
            return None

        title = extract_field("title")
        if not title:
            return None

        # Extract arXiv ID
        arxiv_id = None
        journal = extract_field("journal") or extract_field("booktitle") or ""
        eprint = extract_field("eprint")
        if eprint:
            arxiv_id = eprint
        elif "arxiv" in journal.lower():
            arxiv_match = re.search(r"(\d{4}\.\d{4,5})", journal)
            if arxiv_match:
                arxiv_id = arxiv_match.group(1)

        # Extract PMID from note or url
        pmid = None
        note = extract_field("note") or ""
        url = extract_field("url") or ""
        pmid_match = re.search(r"(?:PMID|pmid)[:\s]*(\d+)", note + " " + url)
        if pmid_match:
            pmid = pmid_match.group(1)

        return Reference(
            key=key,
            title=title,
            authors=extract_field("author"),
            year=extract_field("year"),
            journal=journal,
            doi=extract_field("doi"),
            arxiv_id=arxiv_id,
            pmid=pmid,
            entry_type=entry_type,
        )

    def _parse_plain_text(self, text: str) -> List[Reference]:
        """Fallback: parse numbered plain-text references.

        Handles patterns like:
            1. Author et al. (2023). "Title". Journal.
            [1] Author et al., "Title", Journal, 2023.
        """
        refs = []

        # Pattern: numbered reference with quoted title
        patterns = [
            # "1. Authors (Year). Title. Journal."
            re.compile(
                r"(?:^|\n)\s*(?:\d+[\.\)]\s*|[\[\(]\d+[\]\)]\s*)"
                r"(.+?)\s*[\(\[]?(\d{4})[\)\]]?\s*[\.\,]\s*"
                r'["\u201c](.+?)["\u201d]',
                re.MULTILINE,
            ),
            # Simpler: "Title" (Year)
            re.compile(
                r'["\u201c](.+?)["\u201d]\s*[\(\[]?(\d{4})[\)\]]?',
            ),
        ]

        seen_titles = set()
        for pattern in patterns:
            for m in pattern.finditer(text):
                groups = m.groups()
                if len(groups) >= 3:
                    authors, year, title = groups[0], groups[1], groups[2]
                elif len(groups) >= 2:
                    title, year = groups[0], groups[1]
                    authors = None
                else:
                    continue

                title_lower = title.strip().lower()
                if title_lower in seen_titles or len(title_lower) < 10:
                    continue
                seen_titles.add(title_lower)

                refs.append(
                    Reference(
                        key=f"ref_{len(refs)+1}",
                        title=title.strip(),
                        authors=authors.strip() if authors else None,
                        year=year.strip(),
                    )
                )

        return refs
```
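The brace-counting used by `_extract_braced_content` can be exercised in isolation. This sketch (with `extract_braced` as a hypothetical free-function version of that method, and an invented sample entry) shows it correctly handling a nested-brace title that a non-greedy regex would truncate:

```python
def extract_braced(text: str, open_pos: int):
    """Return the content between the brace at open_pos and its matching close."""
    depth = 0
    for i in range(open_pos, len(text)):
        if text[i] == "{":
            depth += 1
        elif text[i] == "}":
            depth -= 1
            if depth == 0:
                return text[open_pos + 1 : i]
    return None  # unmatched brace


entry = "@article{kim2023, title = {The {BERT} Family}, year = 2023}"
body = extract_braced(entry, entry.index("{"))
print(body)  # kim2023, title = {The {BERT} Family}, year = 2023
```

A non-greedy pattern such as `@\w+\{(.*?)\}` would stop at the first `}` (after `{BERT`), which is why the entry body is delimited by counting rather than by regex.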
