Skip to content

Commit bc802d1

Browse files
Ivan Evtimov and facebook-github-bot
authored and committed
Skip tasks with insufficient samples for pass@k instead of raising exception (#113)
Summary: When computing pass@k metrics with k > 1, tasks that have fewer than k samples are now gracefully skipped with a warning log message instead of raising a ValueError that would terminate the entire results processing.

Changes:
- Replace ValueError with a warning log when n_samples < k for a task
- Add logging module import and logger instance
- Collect skipped groups and log them with full context (dataset, agent, attack, task_id, and sample count)
- Add check for empty DataFrame after filtering in aggregate_results
- Update docstrings to reflect new behavior (Note instead of Raises)
- Also includes refactoring: remove job_name from group_cols to allow aggregating across multiple runs of the same experiment
- Add generic variant_name support alongside legacy template_short_name

Differential Revision: D92393526
Pulled By: evtimovi
1 parent 9389c9a commit bc802d1

File tree

1 file changed

+36
-14
lines changed

1 file changed

+36
-14
lines changed

src/prompt_siren/results.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import itertools
1111
import json
12+
import logging
1213
import sys
1314
from enum import auto
1415
from pathlib import Path
@@ -51,6 +52,8 @@ class Format(StrEnum):
5152
# Note: job_name is not included to allow grouping across jobs with the same agent/attack config
5253
_ALL_GROUP_COLS = ["dataset", "agent_type", "agent_name", "attack_type"]
5354

55+
logger = logging.getLogger(__name__)
56+
5457

5558
def estimate_pass_at_k(num_samples: int | list[int], num_correct: list[int], k: int) -> np.ndarray:
5659
"""Estimates pass@k of each problem and returns them in an array.
@@ -117,11 +120,12 @@ def _parse_index_entry(line: str, job_config: JobConfig) -> dict[str, Any]:
117120
row["attack_type"] = attack_type
118121
row["attack_config"] = attack_config
119122

120-
# For template_string attacks, append the template_short_name
121-
if attack_type == "template_string" and attack_config:
122-
template_short_name = attack_config.get("template_short_name")
123-
if template_short_name:
124-
row["attack_type"] = f"template_string_{template_short_name}"
123+
# Check for variant_name (generic) or template_short_name (legacy for template_string)
124+
variant_name = attack_config.get("variant_name") if attack_config else None
125+
if not variant_name and attack_type == "template_string" and attack_config:
126+
variant_name = attack_config.get("template_short_name")
127+
if variant_name:
128+
row["attack_type"] = f"{attack_type}_{variant_name}"
125129
else:
126130
row["attack_type"] = "benign"
127131
row["attack_config"] = None
@@ -225,8 +229,9 @@ def _group_by_task(df: pd.DataFrame, k: int = 1) -> pd.DataFrame:
225229
return df
226230

227231
# Group by configuration and task
228-
# Include dataset_suite and job_name to disambiguate tasks from different jobs
229-
group_cols = [*_ALL_GROUP_COLS, "dataset_suite", "job_name", "task_id"]
232+
# Note: job_name is NOT included to allow aggregating across multiple runs
233+
# of the same experiment (e.g., for pass@k computation)
234+
group_cols = [*_ALL_GROUP_COLS, "dataset_suite", "task_id"]
230235

231236
if k == 1:
232237
# Original behavior: average across timestamps
@@ -241,15 +246,17 @@ def _group_by_task(df: pd.DataFrame, k: int = 1) -> pd.DataFrame:
241246

242247
# For k > 1: compute pass@k metric
243248
results = []
249+
skipped_groups: list[dict[str, Any]] = []
244250
for group_key, group in df.groupby(group_cols):
245251
n_samples = len(group)
246252

247-
# Error if we don't have enough samples
253+
# Skip groups with insufficient samples and log a warning
248254
if n_samples < k:
249-
task_id = group["task_id"].iloc[0]
250-
raise ValueError(
251-
f"Task '{task_id}' has only {n_samples} samples but k={k}. Need at least k samples to compute pass@{k}."
252-
)
255+
key_tuple = group_key if isinstance(group_key, tuple) else (group_key,)
256+
group_info = dict(zip(group_cols, key_tuple, strict=True))
257+
group_info["n_samples"] = n_samples
258+
skipped_groups.append(group_info)
259+
continue
253260

254261
# Count number of correct samples (score = 1.0)
255262
n_benign_correct = (group["benign_score"] == 1.0).sum()
@@ -272,6 +279,17 @@ def _group_by_task(df: pd.DataFrame, k: int = 1) -> pd.DataFrame:
272279
result_row["n_samples"] = n_samples
273280
results.append(result_row)
274281

282+
# Log warnings for skipped groups
283+
if skipped_groups:
284+
for group_info in skipped_groups:
285+
n_samples = group_info.pop("n_samples")
286+
# Format group identifiers: dataset, agent_type, agent_name, attack_type, task_id
287+
group_str = ", ".join(f"{key}={value}" for key, value in group_info.items())
288+
logger.warning(
289+
f"Skipping group ({group_str}): has only {n_samples} samples but k={k}. "
290+
f"Need at least k samples to compute pass@{k}."
291+
)
292+
275293
return pd.DataFrame(results)
276294

277295

@@ -306,8 +324,8 @@ def aggregate_results(
306324
benign_pass@k, attack_pass@k, n_tasks, avg_n_samples
307325
(aggregates across dataset_suite's and job_name variations)
308326
309-
Raises:
310-
ValueError: If any task has fewer than k samples when k > 1
327+
Note:
328+
Groups with fewer than k samples are excluded and a warning is logged.
311329
"""
312330
# Convert single k to list for uniform handling
313331
k_values = [k] if isinstance(k, int) else k
@@ -341,6 +359,10 @@ def aggregate_results(
341359
# Stage 1: Always group by task (computing pass@k)
342360
df = _group_by_task(df, k=k_value)
343361

362+
# If all groups were filtered out due to insufficient samples, return empty DataFrame
363+
if df.empty:
364+
return pd.DataFrame()
365+
344366
# Determine score column names based on k
345367
benign_col = f"benign_pass@{k_value}"
346368
attack_col = f"attack_pass@{k_value}"

0 commit comments

Comments (0)