Skip to content

Commit 7e2d943

Browse files
committed
Harden eval audit and sandbox
1 parent 0b2cea3 commit 7e2d943

File tree

9 files changed: +690 additions, -175 deletions.

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -217,7 +217,7 @@ See [docs/CONFIGURATION.md#agent-configuration](docs/CONFIGURATION.md#agent-conf
217217

218218
> **Workspace isolation:** During `sanity eval`, each agent runs in an isolated temporary workspace under `/tmp` rather than inside `eval-results/`. This prevents agents from reading other eval results, sibling task solutions, or their own `agent.log`. After the agent finishes, files are copied back to `eval-results/` for validation. Combined with the bubblewrap sandbox (which uses `--tmpfs /tmp`), agents have zero visibility into other evaluations.
219219
220-
> **Sandbox note:** `sanity eval` runs agents inside a [bubblewrap](https://github.com/containers/bubblewrap) sandbox where `$HOME` is read-only. All dot-directories under `$HOME` (e.g. `~/.my-agent/`) are automatically writable, so most agents work out of the box. For non-dot directories, add them to `sanity.toml` under `[sandbox] writable_dirs`. Use `--no-sandbox` to disable.
220+
> **Sandbox note:** `sanity eval` runs agents inside a [bubblewrap](https://github.com/containers/bubblewrap) sandbox where `$HOME` is read-only. All dot-directories under `$HOME` (e.g. `~/.my-agent/`) are automatically writable, so most agents work out of the box. For non-dot writable paths, use `[sandbox] writable_dirs`; for sensitive readable paths to mask, use `[sandbox] readable_denylist`. Use `--no-sandbox` to disable.
221221
222222
> **Legacy mode:** Prior to v1.6.0, a bug caused hidden tests to be included in the workspace during `sanity eval`, making them visible to agents. The `--legacy` flag reproduces this behavior so that older evaluation runs can be fairly compared or resumed. When `--legacy` is active, hidden test files are written to the workspace at init time (instead of being overlaid just before validation), and the hidden-test overlay step is skipped. Use this flag when resuming runs that were originally executed with the buggy behavior.
223223

internal/cli/eval.go

Lines changed: 393 additions & 135 deletions
Large diffs are not rendered by default.

internal/cli/eval_output_test.go

Lines changed: 69 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -14,24 +14,30 @@ func TestGenerateLeaderboardSubmissionIncludesRunMetadata(t *testing.T) {
1414
t.Parallel()
1515

1616
summary := EvalSummary{
17-
Agent: "codex",
18-
Model: "gpt-5",
19-
Timestamp: "2026-02-22T010203",
20-
PassRate: 50.0,
21-
WeightedPassRate: 49.0,
22-
Passed: 13,
23-
Failed: 13,
24-
Total: 26,
25-
WeightedScore: 10.5,
26-
MaxPossibleScore: 20.5,
27-
Timeout: 600,
28-
Parallel: 4,
29-
UseMCPTools: false,
30-
DisableMCP: false,
31-
Sandbox: false,
32-
Legacy: false,
33-
QuotaAffectedTasks: 0,
34-
TotalQuotaRetries: 0,
17+
Agent: "codex",
18+
Model: "gpt-5",
19+
Timestamp: "2026-02-22T010203",
20+
PassRate: 50.0,
21+
WeightedPassRate: 49.0,
22+
Passed: 13,
23+
Failed: 13,
24+
Total: 26,
25+
WeightedScore: 10.5,
26+
MaxPossibleScore: 20.5,
27+
Timeout: 600,
28+
Parallel: 4,
29+
UseMCPTools: false,
30+
DisableMCP: false,
31+
Sandbox: false,
32+
Legacy: false,
33+
QuotaAffectedTasks: 0,
34+
TotalQuotaRetries: 0,
35+
TotalSelfTestCommands: 17,
36+
TotalToolchainInstallAttempts: 2,
37+
TotalOutOfWorkspaceReadAttempts: 3,
38+
TasksWithSelfTesting: 9,
39+
TasksWithToolchainInstall: 1,
40+
TasksWithOutOfWorkspaceReads: 2,
3541
ByLanguage: map[string]EvalAggregate{
3642
"go": {Passed: 3, Failed: 3, Total: 6, PassRate: 50.0},
3743
},
@@ -51,6 +57,24 @@ func TestGenerateLeaderboardSubmissionIncludesRunMetadata(t *testing.T) {
5157
if submission.TotalQuotaRetries != 0 {
5258
t.Fatalf("total_quota_retries = %d, want 0", submission.TotalQuotaRetries)
5359
}
60+
if submission.TotalSelfTestCommands != 17 {
61+
t.Fatalf("total_self_test_commands = %d, want 17", submission.TotalSelfTestCommands)
62+
}
63+
if submission.TotalToolchainInstallAttempts != 2 {
64+
t.Fatalf("total_toolchain_install_attempts = %d, want 2", submission.TotalToolchainInstallAttempts)
65+
}
66+
if submission.TotalOutOfWorkspaceReadAttempts != 3 {
67+
t.Fatalf("total_out_of_workspace_read_attempts = %d, want 3", submission.TotalOutOfWorkspaceReadAttempts)
68+
}
69+
if submission.TasksWithSelfTesting != 9 {
70+
t.Fatalf("tasks_with_self_testing = %d, want 9", submission.TasksWithSelfTesting)
71+
}
72+
if submission.TasksWithToolchainInstall != 1 {
73+
t.Fatalf("tasks_with_toolchain_install = %d, want 1", submission.TasksWithToolchainInstall)
74+
}
75+
if submission.TasksWithOutOfWorkspaceReads != 2 {
76+
t.Fatalf("tasks_with_out_of_workspace_reads = %d, want 2", submission.TasksWithOutOfWorkspaceReads)
77+
}
5478
}
5579

5680
func TestRunConfigMarshalIncludesFalseFlags(t *testing.T) {
@@ -92,21 +116,27 @@ func TestEvalSummaryMarshalIncludesZeroAuditFields(t *testing.T) {
92116
t.Parallel()
93117

94118
summary := EvalSummary{
95-
Agent: "codex",
96-
Timestamp: "2026-02-22T010203",
97-
Timeout: 600,
98-
Parallel: 1,
99-
Results: []EvalResult{},
100-
Passed: 0,
101-
Failed: 0,
102-
Total: 0,
103-
PassRate: 0,
104-
UseMCPTools: false,
105-
DisableMCP: false,
106-
Sandbox: false,
107-
Legacy: false,
108-
QuotaAffectedTasks: 0,
109-
TotalQuotaRetries: 0,
119+
Agent: "codex",
120+
Timestamp: "2026-02-22T010203",
121+
Timeout: 600,
122+
Parallel: 1,
123+
Results: []EvalResult{},
124+
Passed: 0,
125+
Failed: 0,
126+
Total: 0,
127+
PassRate: 0,
128+
UseMCPTools: false,
129+
DisableMCP: false,
130+
Sandbox: false,
131+
Legacy: false,
132+
QuotaAffectedTasks: 0,
133+
TotalQuotaRetries: 0,
134+
TotalSelfTestCommands: 0,
135+
TotalToolchainInstallAttempts: 0,
136+
TotalOutOfWorkspaceReadAttempts: 0,
137+
TasksWithSelfTesting: 0,
138+
TasksWithToolchainInstall: 0,
139+
TasksWithOutOfWorkspaceReads: 0,
110140
}
111141

112142
data, err := json.Marshal(summary)
@@ -124,6 +154,12 @@ func TestEvalSummaryMarshalIncludesZeroAuditFields(t *testing.T) {
124154
`"legacy":false`,
125155
`"quota_affected_tasks":0`,
126156
`"total_quota_retries":0`,
157+
`"total_self_test_commands":0`,
158+
`"total_toolchain_install_attempts":0`,
159+
`"total_out_of_workspace_read_attempts":0`,
160+
`"tasks_with_self_testing":0`,
161+
`"tasks_with_toolchain_install":0`,
162+
`"tasks_with_out_of_workspace_reads":0`,
127163
} {
128164
if !strings.Contains(got, field) {
129165
t.Fatalf("expected summary json to include %s, got: %s", field, got)

internal/cli/eval_prompt_test.go

Lines changed: 130 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -38,11 +38,20 @@ func TestBuildAgentPromptIncludesKeyInfo(t *testing.T) {
3838
"Difficulty:",
3939
"Stub/solution files: demo.go",
4040
"Test files: demo_test.go",
41+
"You may run local tests/commands in the workspace while iterating.",
4142
} {
4243
if !strings.Contains(prompt, s) {
4344
t.Fatalf("prompt missing %q\n\nPrompt:\n%s", s, prompt)
4445
}
4546
}
47+
for _, forbidden := range []string{
48+
"You do NOT need to run tests yourself.",
49+
"Do NOT search for or install language toolchains/SDKs.",
50+
} {
51+
if strings.Contains(prompt, forbidden) {
52+
t.Fatalf("prompt should not include %q\n\nPrompt:\n%s", forbidden, prompt)
53+
}
54+
}
4655

4756
if strings.Contains(prompt, ".txt") {
4857
t.Fatalf("prompt should not include .txt filenames\n\nPrompt:\n%s", prompt)
@@ -882,6 +891,17 @@ func TestIsInfraFailure(t *testing.T) {
882891
writeFiles: true,
883892
wantFailure: false,
884893
},
894+
{
895+
name: "only harness timeout footer",
896+
logContent: "\n\nHARNESS: agent timed out (attempt=1 timeout_seconds=240.000 duration_seconds=240.000)\n",
897+
wantFailure: true,
898+
},
899+
{
900+
name: "harness timeout footer but files written",
901+
logContent: "\n\nHARNESS: agent timed out (attempt=1 timeout_seconds=240.000 duration_seconds=240.000)\n",
902+
writeFiles: true,
903+
wantFailure: false,
904+
},
885905
{
886906
name: "empty log with only agent.log in workspace (harness-created)",
887907
logContent: "",
@@ -935,7 +955,7 @@ func TestBuildSandboxArgs(t *testing.T) {
935955
t.Parallel()
936956

937957
workspaceDir := t.TempDir()
938-
args := buildSandboxArgs(workspaceDir, nil)
958+
args := buildSandboxArgs(workspaceDir, nil, nil)
939959

940960
// Verify required arguments are present.
941961
assertContainsArg := func(flag, value string) {
@@ -999,7 +1019,7 @@ func TestWrapCommandWithSandbox(t *testing.T) {
9991019
cmd := buildAgentCommand(ctx, agentCfg, "test prompt", "", "", false, "test")
10001020
cmd.Dir = workspaceDir
10011021

1002-
wrapped := wrapCommandWithSandbox(ctx, cmd, nil)
1022+
wrapped := wrapCommandWithSandbox(ctx, cmd, nil, nil)
10031023

10041024
// The wrapped command should use bwrap.
10051025
if !strings.HasSuffix(wrapped.Path, "bwrap") {
@@ -1037,3 +1057,111 @@ func TestWrapCommandWithSandbox(t *testing.T) {
10371057
t.Error("expected environment to be preserved in wrapped command")
10381058
}
10391059
}
1060+
1061+
func TestBuildSandboxArgsMasksDenylistedDirs(t *testing.T) {
1062+
t.Parallel()
1063+
1064+
workspaceDir := t.TempDir()
1065+
denyDir := filepath.Join(t.TempDir(), "tasks")
1066+
if err := os.MkdirAll(denyDir, 0o755); err != nil {
1067+
t.Fatalf("mkdir deny dir: %v", err)
1068+
}
1069+
1070+
args := buildSandboxArgs(workspaceDir, nil, []string{denyDir, filepath.Join(t.TempDir(), "missing")})
1071+
1072+
foundMask := false
1073+
for i, arg := range args {
1074+
if arg == "--tmpfs" && i+1 < len(args) && args[i+1] == denyDir {
1075+
foundMask = true
1076+
break
1077+
}
1078+
}
1079+
if !foundMask {
1080+
t.Fatalf("expected denylisted directory %s to be masked via --tmpfs", denyDir)
1081+
}
1082+
}
1083+
1084+
func TestResolveSandboxDenylistPaths(t *testing.T) {
1085+
origDir, _ := os.Getwd()
1086+
repoRoot := t.TempDir()
1087+
if err := os.Chdir(repoRoot); err != nil {
1088+
t.Fatalf("chdir: %v", err)
1089+
}
1090+
defer func() { _ = os.Chdir(origDir) }()
1091+
1092+
absolutePath := filepath.Join(t.TempDir(), "absolute-secret")
1093+
got := resolveSandboxDenylistPaths([]string{"custom-dir", absolutePath}, "")
1094+
1095+
want := []string{
1096+
filepath.Join(repoRoot, "tasks"),
1097+
filepath.Join(repoRoot, "eval-results"),
1098+
filepath.Join(repoRoot, "sessions"),
1099+
filepath.Join(repoRoot, "custom-dir"),
1100+
absolutePath,
1101+
}
1102+
1103+
for _, expected := range want {
1104+
found := false
1105+
for _, path := range got {
1106+
if path == expected {
1107+
found = true
1108+
break
1109+
}
1110+
}
1111+
if !found {
1112+
t.Fatalf("expected denylist to include %s, got %v", expected, got)
1113+
}
1114+
}
1115+
}
1116+
1117+
func TestResolveSandboxDenylistPathsIncludesOutputDir(t *testing.T) {
1118+
t.Parallel()
1119+
1120+
origDir, _ := os.Getwd()
1121+
repoRoot := t.TempDir()
1122+
if err := os.Chdir(repoRoot); err != nil {
1123+
t.Fatalf("chdir: %v", err)
1124+
}
1125+
defer func() { _ = os.Chdir(origDir) }()
1126+
1127+
outputDir := filepath.Join(t.TempDir(), "custom-output")
1128+
got := resolveSandboxDenylistPaths(nil, outputDir)
1129+
1130+
found := false
1131+
for _, path := range got {
1132+
if path == outputDir {
1133+
found = true
1134+
break
1135+
}
1136+
}
1137+
if !found {
1138+
t.Fatalf("expected denylist to include output dir %s, got %v", outputDir, got)
1139+
}
1140+
}
1141+
1142+
func TestParseAgentBehaviorMetrics(t *testing.T) {
1143+
t.Parallel()
1144+
1145+
logPath := filepath.Join(t.TempDir(), "agent.log")
1146+
content := strings.Join([]string{
1147+
"$ go test ./...",
1148+
"$ cargo test",
1149+
"$ curl -sL https://ziglang.org/download/0.13.0/zig-linux-x86_64-0.13.0.tar.xz | tar xJ",
1150+
"$ find / -name zig -type f 2>/dev/null | head -5",
1151+
"/home/user/project/eval-results/old-run",
1152+
}, "\n")
1153+
if err := os.WriteFile(logPath, []byte(content), 0o644); err != nil {
1154+
t.Fatalf("write log: %v", err)
1155+
}
1156+
1157+
metrics := parseAgentBehaviorMetrics(logPath)
1158+
if metrics.SelfTestCommands != 2 {
1159+
t.Fatalf("self test commands = %d, want 2", metrics.SelfTestCommands)
1160+
}
1161+
if metrics.ToolchainInstallAttempts != 1 {
1162+
t.Fatalf("toolchain install attempts = %d, want 1", metrics.ToolchainInstallAttempts)
1163+
}
1164+
if metrics.OutOfWorkspaceReads != 2 {
1165+
t.Fatalf("out-of-workspace reads = %d, want 2", metrics.OutOfWorkspaceReads)
1166+
}
1167+
}

internal/config/config.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -174,7 +174,8 @@ type HarnessConfig struct {
174174

175175
// SandboxConfig contains bubblewrap sandbox settings.
176176
type SandboxConfig struct {
177-
WritableDirs []string `toml:"writable_dirs"` // Additional $HOME-relative dirs to mount writable
177+
WritableDirs []string `toml:"writable_dirs"` // Additional $HOME-relative dirs to mount writable
178+
ReadableDenylist []string `toml:"readable_denylist"` // Repo-relative or absolute paths to hide from agents
178179
}
179180

180181
// DockerConfig contains Docker-related settings.

internal/config/config_test.go

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,9 @@ func TestDefault(t *testing.T) {
2222
if Default.Docker.AutoPull != true {
2323
t.Error("default auto pull should be true")
2424
}
25+
if len(Default.Sandbox.ReadableDenylist) != 0 {
26+
t.Errorf("default readable denylist = %v, want empty", Default.Sandbox.ReadableDenylist)
27+
}
2528
}
2629

2730
func TestLoadNoFile(t *testing.T) {
@@ -59,7 +62,11 @@ max_attempts = 10
5962
[docker]
6063
go_image = "custom-go:latest"
6164
auto_pull = false
62-
`
65+
66+
[sandbox]
67+
writable_dirs = ["go"]
68+
readable_denylist = ["tasks", "/tmp/secret"]
69+
`
6370
if err := os.WriteFile(cfgPath, []byte(content), 0644); err != nil {
6471
t.Fatalf("writing config: %v", err)
6572
}
@@ -84,6 +91,14 @@ auto_pull = false
8491
if cfg.Docker.AutoPull != false {
8592
t.Error("auto pull should be false")
8693
}
94+
if len(cfg.Sandbox.WritableDirs) != 1 || cfg.Sandbox.WritableDirs[0] != "go" {
95+
t.Errorf("sandbox writable dirs = %v, want [go]", cfg.Sandbox.WritableDirs)
96+
}
97+
if len(cfg.Sandbox.ReadableDenylist) != 2 ||
98+
cfg.Sandbox.ReadableDenylist[0] != "tasks" ||
99+
cfg.Sandbox.ReadableDenylist[1] != "/tmp/secret" {
100+
t.Errorf("sandbox readable denylist = %v, want [tasks /tmp/secret]", cfg.Sandbox.ReadableDenylist)
101+
}
87102
}
88103

89104
func TestLoadMissingExplicitFile(t *testing.T) {

0 commit comments

Comments
 (0)