Azure · 0xm00n · Nov 3, 2025 · Nov 3, 2025 · Nov 3, 2025 · Nov 5, 2025
diff --git a/doc/api.rst b/doc/api.rst
@@ -124,6 +124,7 @@ API Reference
     :toctree: _autosummary/
 
     fetch_adv_bench_dataset
+    fetch_anthropic_evals_dataset
     fetch_aya_redteaming_dataset
     fetch_babelscape_alert_dataset
     fetch_ccp_sensitive_prompts_dataset

diff --git a/pyrit/datasets/__init__.py b/pyrit/datasets/__init__.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT license.
 
 from pyrit.datasets.adv_bench_dataset import fetch_adv_bench_dataset
+from pyrit.datasets.anthropic_evals_dataset import fetch_anthropic_evals_dataset
 from pyrit.datasets.aya_redteaming_dataset import fetch_aya_redteaming_dataset
 from pyrit.datasets.babelscape_alert_dataset import fetch_babelscape_alert_dataset
 from pyrit.datasets.darkbench_dataset import fetch_darkbench_dataset
@@ -39,6 +40,7 @@
 
 __all__ = [
     "fetch_adv_bench_dataset",
+    "fetch_anthropic_evals_dataset",
     "fetch_aya_redteaming_dataset",
     "fetch_babelscape_alert_dataset",
     "fetch_ccp_sensitive_prompts_dataset",

diff --git a/pyrit/datasets/anthropic_evals_dataset.py b/pyrit/datasets/anthropic_evals_dataset.py
@@ -0,0 +1,134 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import logging
+from typing import Optional, List, Dict
+from pathlib import Path
+
+import requests
+
+from pyrit.datasets.dataset_helper import fetch_examples
+from pyrit.models import SeedDataset, SeedPrompt
+
+logger = logging.getLogger(__name__)
+
+VALID_CATEGORIES = ["persona", "sycophancy", "advanced-ai-risk", "winogenerated"]
+
+
+def fetch_anthropic_evals_dataset(
+    category: Optional[str] = None,
+    cache_dir: Optional[str] = None,
+) -> SeedDataset:
+    """
+    Fetch Anthropic's model-written-evals dataset from GitHub.
+
+    The model-written-evals dataset contains evaluations for testing LLM behaviors
+    across 4 main categories with 154 total evaluation datasets.
+
+    Reference: https://arxiv.org/abs/2212.09251
+    GitHub: https://github.com/anthropics/evals
+
+    Args:
+        category: Optional category to filter. Available categories:
+            "persona", "sycophancy", "advanced-ai-risk", "winogenerated"
+            Default: None (returns all categories)
+        cache_dir: Optional cache directory for downloaded files
+
+    Returns:
+        SeedDataset with one SeedPrompt per non-empty question in the fetched ".jsonl" files.
+
+    Raises:
+        ValueError: If invalid category is provided or dataset is empty.
+        Exception: If the file listing for a category cannot be retrieved from GitHub.
+    """
+    if category is not None and category not in VALID_CATEGORIES:
+        raise ValueError(
+            f"Invalid category '{category}'. Must be one of: {', '.join(VALID_CATEGORIES)}"
+        )
+
+    categories_to_fetch = [category] if category else VALID_CATEGORIES
+
+    try:
+        logger.info("Loading Anthropic model-written-evals dataset")
+
+        common_metadata = {
+            "dataset_name": "Anthropic Model-Written Evals",
+            "authors": ["Anthropic"],
+            "description": "Evaluations for testing LLM behaviors across persona, sycophancy, AI risk, and bias",
+            "source": "https://github.com/anthropics/evals",
+            "data_type": "text",
+            "name": "Anthropic Evals",
+        }
+
+        seed_prompts = []
+
+        for cat in categories_to_fetch:
+            logger.info(f"Fetching files from category: {cat}")
+            files = _get_category_files(cat)
+
+            for file_info in files:
+                file_name = file_info["name"]
+                if file_name.endswith(".jsonl"):
+                    file_url = file_info["download_url"]
+                    eval_name = file_name.removesuffix(".jsonl")
+                    logger.info(f"Loading evaluation: {eval_name}")
+
+                    examples = fetch_examples(
+                        source=file_url,
+                        source_type="public_url",
+                        cache=True,
+                        data_home=Path(cache_dir) if cache_dir else None,
+                    )
+                    # Build one SeedPrompt per record; records without a question are skipped.
+                    for item in examples:
+                        question = item.get("question", "").strip()
+                        if not question:
+                            logger.warning(f"Skipping item with empty question in {file_name}")
+                        else:
+                            answer_matching = item.get("answer_matching_behavior", "")
+                            answer_not_matching = item.get("answer_not_matching_behavior", "")
+
+                            seed_prompt = SeedPrompt(
+                                value=question,
+                                harm_categories=[],
+                                groups=[eval_name],
+                                metadata={
+                                    "category": cat,
+                                    "answer_matching_behavior": answer_matching,
+                                    "answer_not_matching_behavior": answer_not_matching,
+                                },
+                                **common_metadata,  # type: ignore[arg-type]
+                            )
+                            # Collected per record so no prompt from this file is dropped.
+                            seed_prompts.append(seed_prompt)
+
+        if not seed_prompts:
+            raise ValueError("SeedDataset cannot be empty. Check your filter criteria.")
+
+        logger.info(f"Successfully loaded {len(seed_prompts)} prompts from Anthropic evals dataset")
+
+        return SeedDataset(prompts=seed_prompts)
+
+    except Exception as e:
+        logger.error(f"Failed to load Anthropic evals dataset: {str(e)}")
+        raise
+
+
+def _get_category_files(category: str) -> List[Dict[str, str]]:
+    """
+    Get list of files in a category directory from the GitHub contents API.
+
+    Args:
+        category: The category directory name
+
+    Returns:
+        List of file info dictionaries with 'name' and 'download_url' keys
+
+    Raises:
+        Exception: If the request fails or the GitHub API returns a non-200 status code
+    """
+    api_url = f"https://api.github.com/repos/anthropics/evals/contents/{category}"
+    response = requests.get(api_url, timeout=30)  # avoid hanging forever on a stalled connection
+    if response.status_code != 200:
+        raise Exception(f"Failed to fetch file list for category '{category}'. Status code: {response.status_code}")
+    return [{"name": f["name"], "download_url": f["download_url"]} for f in response.json() if f["type"] == "file"]
diff --git a/tests/integration/datasets/test_fetch_datasets.py b/tests/integration/datasets/test_fetch_datasets.py
@@ -5,6 +5,7 @@
 
 from pyrit.datasets import (
     fetch_adv_bench_dataset,
+    fetch_anthropic_evals_dataset,
     fetch_aya_redteaming_dataset,
     fetch_babelscape_alert_dataset,
     fetch_ccp_sensitive_prompts_dataset,
@@ -40,6 +41,7 @@
     "fetch_function, is_seed_dataset",
     [
         (fetch_adv_bench_dataset, True),
+        (fetch_anthropic_evals_dataset, True),
         (fetch_aya_redteaming_dataset, True),
         (fetch_babelscape_alert_dataset, True),
         (fetch_ccp_sensitive_prompts_dataset, True),