Add --task and --task-category filtering for eval command (#72)

jbragg · web-flow · commit f8ad869c50e5 · 2026-01-14T09:55:45.000-08:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,9 @@ readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
   "click",
-  "inspect-ai>=0.3.104",
+  # Pin inspect-ai to avoid breaking changes in 0.3.137+ (Event classes moved)
+  # See allenai/astabench-issues#275 for upgrade process
+  "inspect-ai>=0.3.104,<0.3.137",
   # pin litellm so that we know what model costs we're using
   # see the Development.md doc before changing
   "litellm>=1.67.4.post1,<=1.75.8",
diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py
@@ -972,6 +972,18 @@ def view_command(
     help="Display format. Defaults to plain.",
     default="plain",
 )
+@click.option(
+    "--task",
+    "task_filters",
+    multiple=True,
+    help="Filter to only run tasks whose name contains this string (can be specified multiple times).",
+)
+@click.option(
+    "--task-category",
+    "task_category_filters",
+    multiple=True,
+    help="Filter to only run tasks with this tag (can be specified multiple times).",
+)
 @click.argument("args", nargs=-1, type=click.UNPROCESSED)
 def eval_command(
     log_dir: str | None,
@@ -980,12 +992,43 @@ def eval_command(
     ignore_git: bool,
     config_only: bool,
     display: str,
+    task_filters: tuple[str, ...],
+    task_category_filters: tuple[str, ...],
     args: tuple[str],
 ):
     """Run inspect eval-set with arguments and append tasks"""
     suite_config = load_suite_config(config_path)
     tasks = suite_config.get_tasks(split)
 
+    # Keep tasks matching any --task value AND any --task-category value (if given)
+    if task_filters or task_category_filters:
+        original_count = len(tasks)
+        filtered_tasks = []
+        for task in tasks:
+            # Check task name filter (substring match)
+            if task_filters:
+                name_match = any(f in task.name for f in task_filters)
+                if not name_match:
+                    continue
+
+            # Check task category filter (exact tag match)
+            if task_category_filters:
+                task_tags = task.get_tag_names()
+                category_match = any(cat in task_tags for cat in task_category_filters)
+                if not category_match:
+                    continue
+
+            filtered_tasks.append(task)
+
+        tasks = filtered_tasks
+        click.echo(f"Filtered to {len(tasks)} of {original_count} tasks")
+
+        if not tasks:
+            raise click.ClickException(
+                "No tasks match the specified filters. "
+                f"Task filters: {task_filters}, Category filters: {task_category_filters}"
+            )
+
     # Verify git status for reproducibility
     if not ignore_git:
         verify_git_reproducibility()
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,3 +1,6 @@
+import os
+import tempfile
+
 from click.testing import CliRunner
 
 from agenteval.cli import cli
@@ -8,3 +11,147 @@ def test_help_displays_usage():
     result = runner.invoke(cli, ["--help"])
     assert result.exit_code == 0
     assert "Usage:" in result.output
+
+
+class TestEvalTaskFiltering:
+    """Tests for --task and --task-category filtering in eval command."""
+
+    def _create_test_config(self, tmpdir):
+        """Write a three-task suite config into tmpdir and return its path."""
+        config_content = """
+name: test-suite
+version: "1.0.0"
+splits:
+  - name: test
+    tasks:
+      - name: ArxivDIGESTables_Clean_train
+        path: tasks/arxiv_clean
+        primary_metric: accuracy
+        tags:
+          - lit
+          - data
+      - name: CodeGenTask_v1
+        path: tasks/codegen
+        primary_metric: pass_rate
+        tags:
+          - code
+      - name: DiscoveryBenchmark_2024
+        path: tasks/discovery
+        primary_metric: f1_score
+        tags:
+          - discovery
+          - lit
+"""
+        config_path = os.path.join(tmpdir, "test_config.yml")
+        with open(config_path, "w") as f:
+            f.write(config_content)
+        return config_path
+
+    def _run_eval(self, *filter_args):
+        """Invoke `eval` on the test config in --config-only mode.
+
+        filter_args are appended verbatim after the shared options, so each
+        test only spells out the filter flags it exercises. The temporary
+        directory is removed before returning; callers should only inspect
+        the captured exit code and output.
+        """
+        runner = CliRunner()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            config_path = self._create_test_config(tmpdir)
+            log_dir = os.path.join(tmpdir, "logs")
+            return runner.invoke(
+                cli,
+                [
+                    "eval",
+                    "--config-path",
+                    config_path,
+                    "--split",
+                    "test",
+                    "--ignore-git",
+                    "--config-only",
+                    "--log-dir",
+                    log_dir,
+                    *filter_args,
+                ],
+            )
+
+    def test_eval_shows_task_filter_options_in_help(self):
+        """Test that --task and --task-category options appear in eval help."""
+        runner = CliRunner()
+        result = runner.invoke(cli, ["eval", "--help"])
+        assert result.exit_code == 0
+        assert "--task TEXT" in result.output
+        assert "--task-category TEXT" in result.output
+        assert "Filter to only run tasks whose name contains" in result.output
+        assert "Filter to only run tasks with this tag" in result.output
+
+    def test_eval_filter_by_task_name(self):
+        """Test filtering by task name substring."""
+        result = self._run_eval("--task", "CodeGen")
+
+        assert result.exit_code == 0, result.output
+        assert "Filtered to 1 of 3 tasks" in result.output
+        assert "tasks/codegen" in result.output
+        assert "tasks/arxiv_clean" not in result.output
+        assert "tasks/discovery" not in result.output
+
+    def test_eval_filter_by_task_category(self):
+        """Test filtering by task category/tag."""
+        result = self._run_eval("--task-category", "lit")
+
+        assert result.exit_code == 0, result.output
+        assert "Filtered to 2 of 3 tasks" in result.output
+        assert "tasks/arxiv_clean" in result.output
+        assert "tasks/discovery" in result.output
+        assert "tasks/codegen" not in result.output
+
+    def test_eval_filter_by_task_and_category_combined(self):
+        """Test filtering by both task name and category (AND logic)."""
+        result = self._run_eval("--task", "Arxiv", "--task-category", "lit")
+
+        assert result.exit_code == 0, result.output
+        assert "Filtered to 1 of 3 tasks" in result.output
+        assert "tasks/arxiv_clean" in result.output
+        # Discovery has "lit" tag but doesn't match "Arxiv"
+        assert "tasks/discovery" not in result.output
+        assert "tasks/codegen" not in result.output
+
+    def test_eval_filter_multiple_task_names(self):
+        """Test filtering with multiple --task options (OR logic within names)."""
+        result = self._run_eval("--task", "CodeGen", "--task", "Discovery")
+
+        assert result.exit_code == 0, result.output
+        assert "Filtered to 2 of 3 tasks" in result.output
+        assert "tasks/codegen" in result.output
+        assert "tasks/discovery" in result.output
+        assert "tasks/arxiv_clean" not in result.output
+
+    def test_eval_filter_multiple_categories(self):
+        """Test filtering with multiple --task-category options (OR logic)."""
+        result = self._run_eval(
+            "--task-category", "code", "--task-category", "discovery"
+        )
+
+        assert result.exit_code == 0, result.output
+        assert "Filtered to 2 of 3 tasks" in result.output
+        assert "tasks/codegen" in result.output
+        assert "tasks/discovery" in result.output
+        assert "tasks/arxiv_clean" not in result.output
+
+    def test_eval_no_filter_runs_all_tasks(self):
+        """Test that no filter runs all tasks."""
+        result = self._run_eval()
+
+        assert result.exit_code == 0, result.output
+        # Should not show "Filtered to" message
+        assert "Filtered to" not in result.output
+        assert "tasks/arxiv_clean" in result.output
+        assert "tasks/codegen" in result.output
+        assert "tasks/discovery" in result.output
+
+    def test_eval_filter_no_matches_fails(self):
+        """Test that filtering with no matches raises an error."""
+        result = self._run_eval("--task", "NonExistentTask")
+
+        assert result.exit_code != 0
+        assert "No tasks match the specified filters" in result.output