Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2a44aa3
Merge pull request #34 from Azure/main
0xm00n Nov 3, 2025
23cd05a
FEAT: Add Sorry-Bench dataset integration
0xm00n Nov 3, 2025
a4e2de9
Address reviewer feedback: update prompt_style API and add input vali…
0xm00n Nov 3, 2025
1603279
FEAT: Add Anthropic model-written-evals dataset (#450)
0xm00n Nov 5, 2025
cee6223
Merge branch 'Azure:main' into feat/anthropic-model-written-evals-dat…
0xm00n Nov 5, 2025
3c3d27b
Update pyrit/datasets/anthropic_evals_dataset.py
0xm00n Nov 7, 2025
9f6bd77
Update pyrit/datasets/sorry_bench_dataset.py
0xm00n Nov 7, 2025
246723e
get rid of redundant error block
0xm00n Nov 7, 2025
ebc3345
Merge branch 'main' into feat/anthropic-model-written-evals-dataset
0xm00n Nov 7, 2025
7ef1b1d
Merge branch 'main' into feat/anthropic-model-written-evals-dataset
0xm00n Nov 9, 2025
569033f
precommit fixes
0xm00n Nov 12, 2025
c411244
Merge branch 'main' into feat/anthropic-model-written-evals-dataset
0xm00n Nov 12, 2025
7f85e30
Merge branch 'main' into feat/anthropic-model-written-evals-dataset
0xm00n Nov 12, 2025
15395bb
Merge branch 'main' into feat/anthropic-model-written-evals-dataset
romanlutz Nov 13, 2025
8f4c44b
Merge branch 'main' into feat/anthropic-model-written-evals-dataset
0xm00n Nov 13, 2025
e6f3318
fix category loading for winogenerated and advanced-ai-risk. winogene…
0xm00n Nov 14, 2025
c9225a9
Merge branch 'main' into feat/anthropic-model-written-evals-dataset
0xm00n Nov 14, 2025
9037ee7
Merge branch 'main' into feat/anthropic-model-written-evals-dataset
0xm00n Nov 21, 2025
6d00ba1
Refactor anthropic evals dataset to use QuestionAnsweringDataset stru…
0xm00n Nov 21, 2025
5fd45a1
Merge branch 'main' into feat/anthropic-model-written-evals-dataset
0xm00n Nov 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ API Reference
:toctree: _autosummary/

fetch_adv_bench_dataset
fetch_anthropic_evals_dataset
fetch_aya_redteaming_dataset
fetch_babelscape_alert_dataset
fetch_ccp_sensitive_prompts_dataset
Expand All @@ -145,6 +146,7 @@ API Reference
fetch_multilingual_vulnerability_dataset
fetch_pku_safe_rlhf_dataset
fetch_seclists_bias_testing_dataset
fetch_sorry_bench_dataset
fetch_sosbench_dataset
fetch_tdc23_redteaming_dataset
fetch_wmdp_dataset
Expand Down
4 changes: 4 additions & 0 deletions pyrit/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

from pyrit.datasets.adv_bench_dataset import fetch_adv_bench_dataset
from pyrit.datasets.anthropic_evals_dataset import fetch_anthropic_evals_dataset
from pyrit.datasets.aya_redteaming_dataset import fetch_aya_redteaming_dataset
from pyrit.datasets.babelscape_alert_dataset import fetch_babelscape_alert_dataset
from pyrit.datasets.darkbench_dataset import fetch_darkbench_dataset
Expand Down Expand Up @@ -34,10 +35,12 @@
fetch_jbb_behaviors_by_harm_category,
fetch_jbb_behaviors_by_jbb_category,
)
from pyrit.datasets.sorry_bench_dataset import fetch_sorry_bench_dataset


__all__ = [
"fetch_adv_bench_dataset",
"fetch_anthropic_evals_dataset",
"fetch_aya_redteaming_dataset",
"fetch_babelscape_alert_dataset",
"fetch_ccp_sensitive_prompts_dataset",
Expand All @@ -58,6 +61,7 @@
"fetch_red_team_social_bias_dataset",
"fetch_seclists_bias_testing_dataset",
"fetch_sosbench_dataset",
"fetch_sorry_bench_dataset",
"fetch_tdc23_redteaming_dataset",
"fetch_transphobia_awareness_dataset",
"fetch_wmdp_dataset",
Expand Down
139 changes: 139 additions & 0 deletions pyrit/datasets/anthropic_evals_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from typing import Optional, List, Dict
from pathlib import Path

import requests

from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedDataset, SeedPrompt

logger = logging.getLogger(__name__)

# The four top-level eval directories in the anthropics/evals GitHub repository;
# _get_category_files queries these as paths of the repo contents API.
VALID_CATEGORIES = ["persona", "sycophancy", "advanced-ai-risk", "winogenerated"]


def fetch_anthropic_evals_dataset(
    category: Optional[str] = None,
    cache_dir: Optional[str] = None,
) -> SeedDataset:
    """
    Fetch Anthropic's model-written-evals dataset from GitHub.

    The model-written-evals dataset contains evaluations for testing LLM behaviors
    across 4 main categories with 154 total evaluation datasets.

    Reference: https://arxiv.org/abs/2212.09251
    GitHub: https://github.com/anthropics/evals

    Args:
        category: Optional category to filter. Available categories:
            "persona", "sycophancy", "advanced-ai-risk", "winogenerated"
            Default: None (returns all categories)
        cache_dir: Optional cache directory for downloaded files

    Returns:
        SeedDataset containing Anthropic eval questions with metadata.

    Raises:
        ValueError: If invalid category is provided or dataset is empty.
    """
    if category is not None and category not in VALID_CATEGORIES:
        raise ValueError(
            f"Invalid category '{category}'. Must be one of: {', '.join(VALID_CATEGORIES)}"
        )

    categories_to_fetch = [category] if category else VALID_CATEGORIES

    try:
        logger.info("Loading Anthropic model-written-evals dataset")

        # Shared SeedPrompt keyword arguments; spread into every prompt below.
        common_metadata = {
            "dataset_name": "Anthropic Model-Written Evals",
            "authors": ["Anthropic"],
            "description": "Evaluations for testing LLM behaviors across persona, sycophancy, AI risk, and bias",
            "source": "https://github.com/anthropics/evals",
            "data_type": "text",
            "name": "Anthropic Evals",
        }

        seed_prompts = []

        for cat in categories_to_fetch:
            logger.info("Fetching files from category: %s", cat)
            files = _get_category_files(cat)

            for file_info in files:
                file_name = file_info["name"]
                # Category directories may contain non-data files; only .jsonl holds evals.
                if not file_name.endswith(".jsonl"):
                    continue

                file_url = file_info["download_url"]
                eval_name = file_name.removesuffix(".jsonl")

                logger.info("Loading evaluation: %s", eval_name)

                examples = fetch_examples(
                    source=file_url,
                    source_type="public_url",
                    cache=True,
                    data_home=Path(cache_dir) if cache_dir else None,
                )

                for item in examples:
                    # `or ""` guards against an explicit null value, which .get's
                    # default does not cover and which would break .strip().
                    question = (item.get("question") or "").strip()
                    if not question:
                        logger.warning("Skipping item with empty question in %s", file_name)
                        continue

                    answer_matching = item.get("answer_matching_behavior", "")
                    answer_not_matching = item.get("answer_not_matching_behavior", "")

                    seed_prompt = SeedPrompt(
                        value=question,
                        harm_categories=[],
                        groups=[eval_name],
                        metadata={
                            "category": cat,
                            "answer_matching_behavior": answer_matching,
                            "answer_not_matching_behavior": answer_not_matching,
                        },
                        **common_metadata,  # type: ignore[arg-type]
                    )

                    seed_prompts.append(seed_prompt)

        if not seed_prompts:
            raise ValueError("SeedDataset cannot be empty. Check your filter criteria.")

        logger.info("Successfully loaded %d prompts from Anthropic evals dataset", len(seed_prompts))

        return SeedDataset(prompts=seed_prompts)

    except ValueError:
        # Validation errors carry a user-actionable message; re-raise untouched.
        raise
    except Exception:
        # Network/parse failures: log with traceback, then propagate to the caller.
        logger.exception("Failed to load Anthropic evals dataset")
        raise


def _get_category_files(category: str) -> List[Dict[str, str]]:
    """
    Get list of files in a category directory from GitHub API.

    Args:
        category: The category directory name

    Returns:
        List of file info dictionaries with 'name' and 'download_url' keys

    Raises:
        Exception: If the GitHub API request does not return HTTP 200.
    """
    api_url = f"https://api.github.com/repos/anthropics/evals/contents/{category}"

    # A timeout is required: without one a stalled connection hangs the fetch forever.
    response = requests.get(api_url, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch file list for category '{category}'. Status code: {response.status_code}")

    files = response.json()
    # The contents API also lists subdirectories; keep only regular files.
    return [{"name": f["name"], "download_url": f["download_url"]} for f in files if f["type"] == "file"]
193 changes: 193 additions & 0 deletions pyrit/datasets/sorry_bench_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from typing import List, Optional

from datasets import load_dataset

from pyrit.models import SeedDataset, SeedPrompt

logger = logging.getLogger(__name__)


# The 44 harm categories defined by Sorry-Bench; mirrors meta_info.py in
# https://huggingface.co/datasets/sorry-bench/sorry-bench-202503
VALID_CATEGORIES = [
    "Personal Insulting Words",
    "Social-group Insulting Words",
    "Threatening Words",
    "Lewd and Obscene Words",
    "Libelous Words",
    "Self-Harm",
    "Violent Crimes",
    "Harassment",
    "Sexual Crimes",
    "Property Crimes",
    "Public Order Crimes",
    "System Intrusion",
    "Malware Code Generation",
    "Fraud",
    "Financial Crimes",
    "IP Infringement",
    "PII Violations",
    "Illegal Crafting",
    "Terrorism",
    "Child-related Crimes",
    "Animal-related Crimes",
    "Environmental Crimes",
    "Evasion of Law Enforcement",
    "Human Elimination",
    "Advice on Adult Content",
    "Sexual Explicit Content Generation",
    "Non-sexual Explicit Content Generation",
    "Fake News Generation",
    "False Advertising",
    "Advice on Discrimination Behaviors",
    "Military Use",
    "Political Belief Promotion",
    "Ethical Belief Promotion",
    "Religion Promotion",
    "Social Stereotype Promotion",
    "Extremist Content Promotion",
    "Conspiracy Theories Promotion",
    "False Common Knowledge Promotion",
    "Harmful & Unhealthy Behaviors Promotion",
    "Medical Advice",
    "Financial Advice",
    "Legal Consulting Advice",
    "Governance Decision Advice",
    "Dangerous Machinery Operation Advice",
]

# "base" is the unmutated prompt; the other 20 entries are linguistic mutations
# (encodings, persuasion framings, translations) applied to the base prompts.
VALID_PROMPT_STYLES = [
    "base",
    "ascii",
    "atbash",
    "authority_endorsement",
    "caesar",
    "evidence-based_persuasion",
    "expert_endorsement",
    "logical_appeal",
    "misrepresentation",
    "misspellings",
    "morse",
    "question",
    "role_play",
    "slang",
    "technical_terms",
    "translate-fr",
    "translate-ml",
    "translate-mr",
    "translate-ta",
    "translate-zh-cn",
    "uncommon_dialects",
]


def fetch_sorry_bench_dataset(
    cache_dir: Optional[str] = None,
    categories: Optional[List[str]] = None,
    prompt_style: Optional[str] = None,
) -> SeedDataset:
    """
    Fetch Sorry-Bench dataset from Hugging Face (updated 2025/03 version).

    The Sorry-Bench dataset contains adversarial prompts designed to test LLM safety
    across 44 categories with 21 different prompt styles (base + 20 linguistic mutations).

    Reference: https://arxiv.org/abs/2406.14598

    Args:
        cache_dir: Optional cache directory for Hugging Face datasets
        categories: Optional list of categories to filter. Full list in:
            https://huggingface.co/datasets/sorry-bench/sorry-bench-202503/blob/main/meta_info.py
        prompt_style: Optional prompt style to filter. Available styles:
            "base", "ascii", "caesar", "slang", "authority_endorsement", etc.
            Default: "base" (only base prompts, no mutations)
            Full list: https://huggingface.co/datasets/sorry-bench/sorry-bench-202503

    Returns:
        SeedDataset containing Sorry-Bench prompts with harm categories.

    Raises:
        ValueError: If invalid categories or prompt_style are provided.
    """
    # Default to the unmutated base prompts.
    if prompt_style is None:
        prompt_style = "base"

    if prompt_style not in VALID_PROMPT_STYLES:
        raise ValueError(f"Invalid prompt_style '{prompt_style}'. Must be one of: {', '.join(VALID_PROMPT_STYLES)}")

    if categories:
        invalid_categories = [cat for cat in categories if cat not in VALID_CATEGORIES]
        if invalid_categories:
            raise ValueError(
                f"Invalid categories: {invalid_categories}. Must be from the list of 44 valid categories. "
                f"See: https://huggingface.co/datasets/sorry-bench/sorry-bench-202503/blob/main/meta_info.py"
            )

    try:
        source = "sorry-bench/sorry-bench-202503"
        logger.info("Loading Sorry-Bench dataset from %s", source)
        data = load_dataset(source, cache_dir=cache_dir)

        dataset_split = data["train"]

        # Shared SeedPrompt keyword arguments; spread into every prompt below.
        common_metadata = {
            "dataset_name": "Sorry-Bench",
            "authors": ["Sorry-Bench Team"],
            "description": "Adversarial prompts for testing LLM safety across 44 categories",
            "source": source,
            "data_type": "text",
            "name": "Sorry-Bench 2025-03",
        }

        seed_prompts = []

        for item in dataset_split:
            category = item.get("category", "")
            item_prompt_style = item.get("prompt_style", "")
            question_id = item.get("question_id")

            # Apply the cheap filters first: with the default "base" style, 20 of
            # the 21 style variants are dropped here, so filtered-out items never
            # trigger the data-quality warnings below.
            if prompt_style != item_prompt_style:
                continue

            if categories and category not in categories:
                continue

            turns = item.get("turns", [])
            if not turns:
                logger.warning("Skipping item %s with empty turns", question_id)
                continue

            # Only the first turn is used as the prompt text.
            prompt_text = turns[0].strip()
            if not prompt_text:
                logger.warning("Skipping item %s with empty prompt text", question_id)
                continue

            seed_prompt = SeedPrompt(
                value=prompt_text,
                harm_categories=[category],
                groups=[item_prompt_style] if item_prompt_style else [],
                metadata={
                    "sorry_bench_category": category,
                    "prompt_style": item_prompt_style,
                    "question_id": question_id,
                },
                **common_metadata,  # type: ignore[arg-type]
            )

            seed_prompts.append(seed_prompt)

        if not seed_prompts:
            raise ValueError("SeedDataset cannot be empty. Check your filter criteria.")

        logger.info("Successfully loaded %d prompts from Sorry-Bench dataset", len(seed_prompts))

        return SeedDataset(prompts=seed_prompts)

    except ValueError:
        # Validation errors carry a user-actionable message; re-raise untouched.
        raise
    except Exception:
        # Download/parse failures: log with traceback, then propagate to the caller.
        logger.exception("Failed to load Sorry-Bench dataset")
        raise
Loading