Skip to content

Commit 8503053

Browse files
configure monitor via CLI (chz)
1 parent 785c7e8 commit 8503053

6 files changed

Lines changed: 111 additions & 96 deletions

File tree

project/paperbench/paperbench/monitor/create_monitor.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

project/paperbench/paperbench/monitor/monitor.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
from __future__ import annotations
2+
13
import re
24
from abc import ABC, abstractmethod
35
from dataclasses import dataclass
46
from typing import Any
57

68
import blobfile as bf
79
import structlog.stdlib
10+
from pydantic import BaseModel
811
from unidecode import unidecode
912

1013
from paperbench.paper_registry import Paper
@@ -51,6 +54,14 @@ def to_dict(self) -> dict[str, Any]:
5154
class Monitor(ABC):
5255
"""Base class for monitoring agent behavior through logs."""
5356

57+
class Config(BaseModel, ABC):
    """Serializable configuration for a :class:`Monitor`.

    Subclasses implement :meth:`build`, which turns the configuration
    into a monitor instance for a given paper.
    """

    @abstractmethod
    def build(self, paper: Paper) -> Monitor:
        """Instantiate the monitor for the provided paper."""
64+
5465
def __init__(
5566
self,
5667
paper: Paper,
@@ -74,12 +85,18 @@ def __init__(
7485
@abstractmethod
7586
def check_log(self, log_file: str) -> MonitorResult:
7687
"""Check a log file for violations of monitoring rules."""
77-
raise NotImplementedError()
88+
...
7889

7990

8091
class BasicMonitor(Monitor):
8192
"""Simple implementation that checks for occurrences of blacklisted terms with git clone, curl, or wget commands in agent logs."""
8293

94+
class Config(Monitor.Config):
    """Configuration for :class:`BasicMonitor`.

    Adds no fields of its own; :meth:`build` passes only the paper on to
    the monitor.
    """

    def build(self, paper: Paper) -> BasicMonitor:
        """Create a :class:`BasicMonitor` for ``paper``."""
        basic_monitor = BasicMonitor(paper=paper)
        return basic_monitor
99+
83100
def _normalize_url(self, url: str) -> str:
84101
"""Normalize URL by removing protocol, parameters, and anchors."""
85102
# Remove protocol (http:// or https://)

project/paperbench/paperbench/nano/eval.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from nanoeval.solvers.computer_tasks.steps import FinalResult
1919
from nanoeval.solvers.computer_tasks.task import ComputerTask
2020
from paperbench.metrics import compute_agg_stats, per_paper_results
21+
from paperbench.monitor.monitor import BasicMonitor, Monitor
2122
from paperbench.nano.structs import (
2223
JudgeConfig,
2324
PaperBenchGrade,
@@ -48,6 +49,7 @@
4849
class PaperBench(PythonCodingEval):
4950
reproduction: ReproductionConfig = chz.field(default_factory=ReproductionConfig)
5051
judge: JudgeConfig = chz.field(default_factory=JudgeConfig)
52+
monitor_config: Monitor.Config = chz.field(default_factory=BasicMonitor.Config)
5153

5254
# task args
5355
paper_split: Literal["debug", "dev", "human", "testing", "all"] = chz.field(
@@ -153,6 +155,7 @@ async def get_instances(self) -> list[PBTask]:
153155
target_duration_hr=self.target_duration_hr,
154156
judge=self.judge,
155157
reproduction=self.reproduction,
158+
monitor_config=self.monitor_config,
156159
save_cluster_output_to_host=self.save_cluster_output_to_host,
157160
network_mode=NetworkMode.UNPROXIED
158161
if self.allow_internet

project/paperbench/paperbench/nano/task.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@
3838
WORKSPACE_BASE,
3939
)
4040
from paperbench.grade import JudgeOutput, grade_submission
41-
from paperbench.monitor.create_monitor import create_monitor
42-
from paperbench.monitor.monitor import MonitorResult
41+
from paperbench.monitor.monitor import Monitor, MonitorResult
4342
from paperbench.nano.structs import (
4443
JudgeConfig,
4544
PaperBenchGrade,
@@ -68,6 +67,7 @@ class PBTask(ComputerTask):
6867
target_duration_hr: int | None
6968
reproduction: ReproductionConfig
7069
judge: JudgeConfig
70+
monitor_config: Monitor.Config
7171
skipped_rollout: bool = False # whether rollouts were skipped (e.g. if we're resuming)
7272

7373
save_cluster_output_to_host: bool
@@ -334,10 +334,7 @@ def _should_monitor(self, log_file_path: str) -> bool:
334334
return False
335335

336336
def _run_monitor(self, log_file_path: str) -> MonitorResult:
337-
"""
338-
Runs the monitor on an given log file
339-
TODO: make this configurable through chz in `PaperBenchEval`
340-
"""
337+
"""Run the configured monitor on the given log file."""
341338
ctx_logger = logger.bind(
342339
run_group_id=self.run_group_id, run_id=self.run_id, runs_dir=self.runs_dir
343340
)
@@ -346,11 +343,7 @@ def _run_monitor(self, log_file_path: str) -> MonitorResult:
346343
f"Running monitor on {self.run_id} agent.log", destinations=["run"], _print=True
347344
)
348345
paper = paper_registry.get_paper(self.paper_id)
349-
monitor = create_monitor(
350-
monitor_type="basic",
351-
paper=paper,
352-
monitor_kwargs={},
353-
)
346+
monitor = self.monitor_config.build(paper=paper)
354347
monitor_result = monitor.check_log(log_file_path)
355348
return monitor_result
356349

project/paperbench/paperbench/scripts/run_monitor.py

Lines changed: 59 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
import argparse
1+
from __future__ import annotations
2+
23
import asyncio
34
import datetime
45
import json
@@ -11,20 +12,46 @@
1112
import structlog.stdlib
1213
from tqdm.asyncio import tqdm_asyncio
1314

14-
from paperbench.monitor.create_monitor import create_monitor
15+
import chz
16+
from paperbench.monitor.monitor import BasicMonitor, Monitor
1517
from paperbench.paper_registry import paper_registry
1618

1719
logger = structlog.stdlib.get_logger(component=__name__)
1820

1921

22+
def _describe_monitor_config(monitor_config: Monitor.Config) -> str:
    """Return ``"<module>.<qualname>"`` for the class of ``monitor_config``."""
    config_cls = monitor_config.__class__
    return f"{config_cls.__module__}.{config_cls.__qualname__}"
24+
25+
26+
@chz.chz
class MonitorCLIArgs:
    """Monitor agent logs for violations."""

    # No default is given for the logs directory.
    logs_dir: Path = chz.field(doc="Directory containing multiple run groups.")

    # Defaults to an empty list; ``_run_from_cli`` forwards an empty list as ``None``.
    run_groups: list[str] = chz.field(
        doc="List of run group IDs to monitor.",
        default_factory=list,
    )

    # Any ``Monitor.Config`` is accepted; ``BasicMonitor.Config`` is the default.
    monitor_config: Monitor.Config = chz.field(
        doc="Specify the monitor to use (default: BasicMonitor).",
        default_factory=BasicMonitor.Config,
    )

    # ``None`` when no output directory is specified.
    out_dir: Path | None = chz.field(
        doc="Directory to save the monitor results JSON file (default: current directory).",
        default=None,
    )
45+
46+
2047
def get_paper_id_from_run_id(run_id: str) -> str:
    """Return the part of ``run_id`` that precedes its first underscore.

    Example: ``'rice_508398cb-0825-4bf0-b647-a9200ac03d21'`` gives ``'rice'``.
    A run ID containing no underscore is returned unchanged.
    """
    paper_id, _separator, _remainder = run_id.partition("_")
    return paper_id
2350

2451

2552
async def monitor_single_log(
2653
run_dir: Path,
27-
monitor_type: str,
54+
monitor_config: Monitor.Config,
2855
) -> dict[str, Any] | None:
2956
"""
3057
Monitor a single run's log with the specified monitor.
@@ -67,22 +94,24 @@ async def monitor_single_log(
6794
logger.warning(f"Log file not found at {log_file}")
6895
return None
6996

70-
logger.info(f"Running monitor on agent.log from {run_id}")
97+
monitor_config_payload = monitor_config.model_dump(mode="json")
98+
logger.info(
99+
f"Running monitor on agent.log from {run_id}",
100+
monitor=_describe_monitor_config(monitor_config),
101+
monitor_config_json=json.dumps(monitor_config_payload, indent=2),
102+
)
71103

72104
# Create monitor
73105
paper = paper_registry.get_paper(paper_id)
74-
monitor = create_monitor(
75-
monitor_type=monitor_type,
76-
paper=paper,
77-
monitor_kwargs={},
78-
)
106+
monitor = monitor_config.build(paper=paper)
79107

80108
# Run monitor on the log file
81109
result = await asyncio.to_thread(monitor.check_log, log_file.as_posix())
82110

83111
return {
84112
"run_group_id": run_dir.parent.name,
85-
"monitor_type": monitor_type,
113+
"monitor_type": _describe_monitor_config(monitor_config),
114+
"monitor_config": monitor_config_payload,
86115
"paper_id": paper_id,
87116
"log_file": str(log_file),
88117
"run_id": run_id,
@@ -103,7 +132,7 @@ async def monitor_single_log(
103132

104133
async def monitor_run_group(
105134
group_dir: Path,
106-
monitor_type: str,
135+
monitor_config: Monitor.Config,
107136
) -> list[dict[str, Any] | None]:
108137
"""Monitor all runs in a run group directory."""
109138
run_group_id = group_dir.name
@@ -115,7 +144,7 @@ async def monitor_run_group(
115144
tasks = [
116145
monitor_single_log(
117146
run_dir=run_dir,
118-
monitor_type=monitor_type,
147+
monitor_config=monitor_config,
119148
)
120149
for run_dir in run_dirs
121150
]
@@ -126,7 +155,7 @@ async def monitor_run_group(
126155

127156
async def monitor_multiple_run_groups(
128157
logs_dir: Path,
129-
monitor_type: str,
158+
monitor_config: Monitor.Config,
130159
run_groups: list[str] | None = None,
131160
) -> dict[str, Any] | None:
132161
"""Run monitor on multiple run groups that are in a directory of run groups."""
@@ -154,7 +183,7 @@ async def monitor_multiple_run_groups(
154183
tasks = [
155184
monitor_run_group(
156185
group_dir=logs_dir / run_group_id,
157-
monitor_type=monitor_type,
186+
monitor_config=monitor_config,
158187
)
159188
for run_group_id in run_groups
160189
]
@@ -169,10 +198,13 @@ async def monitor_multiple_run_groups(
169198
flagged_results = [result for result in all_results if len(result["results"]["violations"]) > 0]
170199
other_results = [result for result in all_results if len(result["results"]["violations"]) == 0]
171200

201+
monitor_config_payload = monitor_config.model_dump(mode="json")
202+
172203
# Create final output with results and summary
173204
return {
174205
"timestamp": datetime.datetime.now().isoformat(),
175-
"monitor_type": monitor_type,
206+
"monitor_type": _describe_monitor_config(monitor_config),
207+
"monitor_config": monitor_config_payload,
176208
"logs_dir": str(logs_dir.absolute()),
177209
"run_groups": run_groups,
178210
"total_runs": len(all_results),
@@ -184,21 +216,23 @@ async def monitor_multiple_run_groups(
184216

185217

186218
async def main(
187-
monitor_type: str,
188219
logs_dir: Path,
220+
monitor_config: Monitor.Config,
189221
run_groups: list[str] | None = None,
190222
out_dir: Path | None = None,
191223
) -> None:
192224
"""
193225
Main function to run the monitor on a directory of logs.
194226
"""
195227

228+
monitor_config = monitor_config.model_copy()
229+
196230
if out_dir:
197231
out_dir.mkdir(parents=True, exist_ok=True)
198232

199233
results = await monitor_multiple_run_groups(
200234
logs_dir=logs_dir,
201-
monitor_type=monitor_type,
235+
monitor_config=monitor_config,
202236
run_groups=run_groups,
203237
)
204238

@@ -211,41 +245,14 @@ async def main(
211245
logger.info(f"All monitor results written to {output_file}")
212246

213247

214-
if __name__ == "__main__":
215-
parser = argparse.ArgumentParser(description="Monitor agent logs for violations.")
216-
parser.add_argument(
217-
"--logs-dir",
218-
type=Path,
219-
help="Directory containing multiple run groups.",
220-
required=True,
221-
)
222-
parser.add_argument(
223-
"--run-groups",
224-
nargs="+",
225-
help="List of run group IDs to monitor.",
226-
required=False,
227-
)
228-
parser.add_argument(
229-
"-m",
230-
"--monitor",
231-
choices=["basic"],
232-
default="basic",
233-
help="Specify the monitor to use (default: basic).",
234-
)
235-
parser.add_argument(
236-
"--out-dir",
237-
type=Path,
238-
help="Directory to save the monitor results JSON file (default: current directory).",
239-
required=False,
248+
async def _run_from_cli(args: MonitorCLIArgs) -> None:
    """Forward the parsed CLI arguments to :func:`main`."""
    # An empty ``run_groups`` list is handed to ``main`` as ``None``.
    selected_groups = args.run_groups or None
    await main(
        logs_dir=args.logs_dir,
        monitor_config=args.monitor_config,
        run_groups=selected_groups,
        out_dir=args.out_dir,
    )
241255

242-
args = parser.parse_args()
243256

244-
asyncio.run(
245-
main(
246-
monitor_type=args.monitor,
247-
logs_dir=args.logs_dir,
248-
run_groups=args.run_groups,
249-
out_dir=args.out_dir,
250-
)
251-
)
257+
if __name__ == "__main__":
    # ``_run_from_cli`` is async, so the entrypoint call yields a coroutine
    # that asyncio then runs to completion.
    cli_coroutine = chz.nested_entrypoint(_run_from_cli)
    asyncio.run(cli_coroutine)

0 commit comments

Comments
 (0)