lemon07r
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎internal/cli/eval.go‎
Lines changed: 393 additions & 135 deletions b/‎internal/cli/eval.go‎
Lines changed: 393 additions & 135 deletions
diff --git a/‎internal/cli/eval_output_test.go‎
Lines changed: 69 additions & 33 deletions b/‎internal/cli/eval_output_test.go‎
Lines changed: 69 additions & 33 deletions
diff --git a/‎internal/cli/eval_prompt_test.go‎
Lines changed: 130 additions & 2 deletions b/‎internal/cli/eval_prompt_test.go‎
Lines changed: 130 additions & 2 deletions
diff --git a/‎internal/config/config.go‎
Lines changed: 2 additions & 1 deletion b/‎internal/config/config.go‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎internal/config/config_test.go‎
Lines changed: 16 additions & 1 deletion b/‎internal/config/config_test.go‎
Lines changed: 16 additions & 1 deletion
@@ -217,7 +217,7 @@ See [docs/CONFIGURATION.md#agent-configuration](docs/CONFIGURATION.md#agent-conf
 
 > **Workspace isolation:** During `sanity eval`, each agent runs in an isolated temporary workspace under `/tmp` rather than inside `eval-results/`. This prevents agents from reading other eval results, sibling task solutions, or their own `agent.log`. After the agent finishes, files are copied back to `eval-results/` for validation. Combined with the bubblewrap sandbox (which uses `--tmpfs /tmp`), agents have zero visibility into other evaluations.
 
-> **Sandbox note:** `sanity eval` runs agents inside a [bubblewrap](https://github.com/containers/bubblewrap) sandbox where `$HOME` is read-only. All dot-directories under `$HOME` (e.g. `~/.my-agent/`) are automatically writable, so most agents work out of the box. For non-dot directories, add them to `sanity.toml` under `[sandbox] writable_dirs`. Use `--no-sandbox` to disable.
+> **Sandbox note:** `sanity eval` runs agents inside a [bubblewrap](https://github.com/containers/bubblewrap) sandbox where `$HOME` is read-only. All dot-directories under `$HOME` (e.g. `~/.my-agent/`) are automatically writable, so most agents work out of the box. To make a non-dot directory writable, add it to `[sandbox] writable_dirs` in `sanity.toml`; to hide sensitive paths that would otherwise be readable, add them to `[sandbox] readable_denylist`. Use `--no-sandbox` to disable the sandbox.
 
 > **Legacy mode:** Prior to v1.6.0, a bug caused hidden tests to be included in the workspace during `sanity eval`, making them visible to agents. The `--legacy` flag reproduces this behavior so that older evaluation runs can be fairly compared or resumed. When `--legacy` is active, hidden test files are written to the workspace at init time (instead of being overlaid just before validation), and the hidden-test overlay step is skipped. Use this flag when resuming runs that were originally executed with the buggy behavior.
 
 
@@ -14,24 +14,30 @@ func TestGenerateLeaderboardSubmissionIncludesRunMetadata(t *testing.T) {
 	t.Parallel()
 
 	summary := EvalSummary{
-		Agent:              "codex",
-		Model:              "gpt-5",
-		Timestamp:          "2026-02-22T010203",
-		PassRate:           50.0,
-		WeightedPassRate:   49.0,
-		Passed:             13,
-		Failed:             13,
-		Total:              26,
-		WeightedScore:      10.5,
-		MaxPossibleScore:   20.5,
-		Timeout:            600,
-		Parallel:           4,
-		UseMCPTools:        false,
-		DisableMCP:         false,
-		Sandbox:            false,
-		Legacy:             false,
-		QuotaAffectedTasks: 0,
-		TotalQuotaRetries:  0,
+		Agent:                           "codex",
+		Model:                           "gpt-5",
+		Timestamp:                       "2026-02-22T010203",
+		PassRate:                        50.0,
+		WeightedPassRate:                49.0,
+		Passed:                          13,
+		Failed:                          13,
+		Total:                           26,
+		WeightedScore:                   10.5,
+		MaxPossibleScore:                20.5,
+		Timeout:                         600,
+		Parallel:                        4,
+		UseMCPTools:                     false,
+		DisableMCP:                      false,
+		Sandbox:                         false,
+		Legacy:                          false,
+		QuotaAffectedTasks:              0,
+		TotalQuotaRetries:               0,
+		TotalSelfTestCommands:           17,
+		TotalToolchainInstallAttempts:   2,
+		TotalOutOfWorkspaceReadAttempts: 3,
+		TasksWithSelfTesting:            9,
+		TasksWithToolchainInstall:       1,
+		TasksWithOutOfWorkspaceReads:    2,
 		ByLanguage: map[string]EvalAggregate{
 			"go": {Passed: 3, Failed: 3, Total: 6, PassRate: 50.0},
 		},
@@ -51,6 +57,24 @@ func TestGenerateLeaderboardSubmissionIncludesRunMetadata(t *testing.T) {
 	if submission.TotalQuotaRetries != 0 {
 		t.Fatalf("total_quota_retries = %d, want 0", submission.TotalQuotaRetries)
 	}
+	if submission.TotalSelfTestCommands != 17 {
+		t.Fatalf("total_self_test_commands = %d, want 17", submission.TotalSelfTestCommands)
+	}
+	if submission.TotalToolchainInstallAttempts != 2 {
+		t.Fatalf("total_toolchain_install_attempts = %d, want 2", submission.TotalToolchainInstallAttempts)
+	}
+	if submission.TotalOutOfWorkspaceReadAttempts != 3 {
+		t.Fatalf("total_out_of_workspace_read_attempts = %d, want 3", submission.TotalOutOfWorkspaceReadAttempts)
+	}
+	if submission.TasksWithSelfTesting != 9 {
+		t.Fatalf("tasks_with_self_testing = %d, want 9", submission.TasksWithSelfTesting)
+	}
+	if submission.TasksWithToolchainInstall != 1 {
+		t.Fatalf("tasks_with_toolchain_install = %d, want 1", submission.TasksWithToolchainInstall)
+	}
+	if submission.TasksWithOutOfWorkspaceReads != 2 {
+		t.Fatalf("tasks_with_out_of_workspace_reads = %d, want 2", submission.TasksWithOutOfWorkspaceReads)
+	}
 }
 
 func TestRunConfigMarshalIncludesFalseFlags(t *testing.T) {
@@ -92,21 +116,27 @@ func TestEvalSummaryMarshalIncludesZeroAuditFields(t *testing.T) {
 	t.Parallel()
 
 	summary := EvalSummary{
-		Agent:              "codex",
-		Timestamp:          "2026-02-22T010203",
-		Timeout:            600,
-		Parallel:           1,
-		Results:            []EvalResult{},
-		Passed:             0,
-		Failed:             0,
-		Total:              0,
-		PassRate:           0,
-		UseMCPTools:        false,
-		DisableMCP:         false,
-		Sandbox:            false,
-		Legacy:             false,
-		QuotaAffectedTasks: 0,
-		TotalQuotaRetries:  0,
+		Agent:                           "codex",
+		Timestamp:                       "2026-02-22T010203",
+		Timeout:                         600,
+		Parallel:                        1,
+		Results:                         []EvalResult{},
+		Passed:                          0,
+		Failed:                          0,
+		Total:                           0,
+		PassRate:                        0,
+		UseMCPTools:                     false,
+		DisableMCP:                      false,
+		Sandbox:                         false,
+		Legacy:                          false,
+		QuotaAffectedTasks:              0,
+		TotalQuotaRetries:               0,
+		TotalSelfTestCommands:           0,
+		TotalToolchainInstallAttempts:   0,
+		TotalOutOfWorkspaceReadAttempts: 0,
+		TasksWithSelfTesting:            0,
+		TasksWithToolchainInstall:       0,
+		TasksWithOutOfWorkspaceReads:    0,
 	}
 
 	data, err := json.Marshal(summary)
@@ -124,6 +154,12 @@ func TestEvalSummaryMarshalIncludesZeroAuditFields(t *testing.T) {
 		`"legacy":false`,
 		`"quota_affected_tasks":0`,
 		`"total_quota_retries":0`,
+		`"total_self_test_commands":0`,
+		`"total_toolchain_install_attempts":0`,
+		`"total_out_of_workspace_read_attempts":0`,
+		`"tasks_with_self_testing":0`,
+		`"tasks_with_toolchain_install":0`,
+		`"tasks_with_out_of_workspace_reads":0`,
 	} {
 		if !strings.Contains(got, field) {
 			t.Fatalf("expected summary json to include %s, got: %s", field, got)
 
@@ -38,11 +38,20 @@ func TestBuildAgentPromptIncludesKeyInfo(t *testing.T) {
 		"Difficulty:",
 		"Stub/solution files: demo.go",
 		"Test files:          demo_test.go",
+		"You may run local tests/commands in the workspace while iterating.",
 	} {
 		if !strings.Contains(prompt, s) {
 			t.Fatalf("prompt missing %q\n\nPrompt:\n%s", s, prompt)
 		}
 	}
+	for _, forbidden := range []string{
+		"You do NOT need to run tests yourself.",
+		"Do NOT search for or install language toolchains/SDKs.",
+	} {
+		if strings.Contains(prompt, forbidden) {
+			t.Fatalf("prompt should not include %q\n\nPrompt:\n%s", forbidden, prompt)
+		}
+	}
 
 	if strings.Contains(prompt, ".txt") {
 		t.Fatalf("prompt should not include .txt filenames\n\nPrompt:\n%s", prompt)
@@ -882,6 +891,17 @@ func TestIsInfraFailure(t *testing.T) {
 			writeFiles:  true,
 			wantFailure: false,
 		},
+		{
+			name:        "only harness timeout footer",
+			logContent:  "\n\nHARNESS: agent timed out (attempt=1 timeout_seconds=240.000 duration_seconds=240.000)\n",
+			wantFailure: true,
+		},
+		{
+			name:        "harness timeout footer but files written",
+			logContent:  "\n\nHARNESS: agent timed out (attempt=1 timeout_seconds=240.000 duration_seconds=240.000)\n",
+			writeFiles:  true,
+			wantFailure: false,
+		},
 		{
 			name:          "empty log with only agent.log in workspace (harness-created)",
 			logContent:    "",
@@ -935,7 +955,7 @@ func TestBuildSandboxArgs(t *testing.T) {
 	t.Parallel()
 
 	workspaceDir := t.TempDir()
-	args := buildSandboxArgs(workspaceDir, nil)
+	args := buildSandboxArgs(workspaceDir, nil, nil)
 
 	// Verify required arguments are present.
 	assertContainsArg := func(flag, value string) {
@@ -999,7 +1019,7 @@ func TestWrapCommandWithSandbox(t *testing.T) {
 	cmd := buildAgentCommand(ctx, agentCfg, "test prompt", "", "", false, "test")
 	cmd.Dir = workspaceDir
 
-	wrapped := wrapCommandWithSandbox(ctx, cmd, nil)
+	wrapped := wrapCommandWithSandbox(ctx, cmd, nil, nil)
 
 	// The wrapped command should use bwrap.
 	if !strings.HasSuffix(wrapped.Path, "bwrap") {
@@ -1037,3 +1057,111 @@ func TestWrapCommandWithSandbox(t *testing.T) {
 		t.Error("expected environment to be preserved in wrapped command")
 	}
 }
+
+// TestBuildSandboxArgsMasksDenylistedDirs checks that an existing denylisted
+// directory appears in the sandbox arguments as an adjacent "--tmpfs <dir>"
+// pair. A second, non-existent path is passed alongside it; nothing is
+// asserted about how that entry is treated.
+func TestBuildSandboxArgsMasksDenylistedDirs(t *testing.T) {
+	t.Parallel()
+
+	workspaceDir := t.TempDir()
+	denyDir := filepath.Join(t.TempDir(), "tasks")
+	if err := os.MkdirAll(denyDir, 0o755); err != nil {
+		t.Fatalf("mkdir deny dir: %v", err)
+	}
+	missingDir := filepath.Join(t.TempDir(), "missing")
+
+	args := buildSandboxArgs(workspaceDir, nil, []string{denyDir, missingDir})
+
+	// Walk every (flag, value) pair; the first match is enough to pass.
+	for i := 0; i+1 < len(args); i++ {
+		if args[i] == "--tmpfs" && args[i+1] == denyDir {
+			return
+		}
+	}
+	t.Fatalf("expected denylisted directory %s to be masked via --tmpfs", denyDir)
+}
+
+// TestResolveSandboxDenylistPaths checks that resolveSandboxDenylistPaths
+// returns, for a relative and an absolute custom entry and an empty output
+// dir, at least: the "tasks", "eval-results" and "sessions" directories under
+// the current working directory, the relative entry joined onto the working
+// directory, and the absolute entry unchanged.
+//
+// The test changes the process-wide working directory, so it must not be
+// marked parallel.
+func TestResolveSandboxDenylistPaths(t *testing.T) {
+	origDir, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("getwd: %v", err)
+	}
+	repoRoot := t.TempDir()
+	if err := os.Chdir(repoRoot); err != nil {
+		t.Fatalf("chdir: %v", err)
+	}
+	defer func() {
+		// Surface a failed restore: later tests would otherwise run from a
+		// temp dir that is about to be removed.
+		if err := os.Chdir(origDir); err != nil {
+			t.Errorf("restore working directory: %v", err)
+		}
+	}()
+
+	absolutePath := filepath.Join(t.TempDir(), "absolute-secret")
+	got := resolveSandboxDenylistPaths([]string{"custom-dir", absolutePath}, "")
+
+	want := []string{
+		filepath.Join(repoRoot, "tasks"),
+		filepath.Join(repoRoot, "eval-results"),
+		filepath.Join(repoRoot, "sessions"),
+		filepath.Join(repoRoot, "custom-dir"),
+		absolutePath,
+	}
+
+	// Index the result once so each expected path is a single lookup.
+	present := make(map[string]struct{}, len(got))
+	for _, path := range got {
+		present[path] = struct{}{}
+	}
+	for _, expected := range want {
+		if _, ok := present[expected]; !ok {
+			t.Fatalf("expected denylist to include %s, got %v", expected, got)
+		}
+	}
+}
+
+// TestResolveSandboxDenylistPathsIncludesOutputDir checks that an absolute
+// output directory passed to resolveSandboxDenylistPaths appears verbatim in
+// the returned denylist.
+//
+// This test deliberately does NOT call t.Parallel: os.Chdir changes the
+// working directory of the whole process, so running it concurrently with
+// other tests would race with anything that depends on the working directory.
+func TestResolveSandboxDenylistPathsIncludesOutputDir(t *testing.T) {
+	origDir, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("getwd: %v", err)
+	}
+	repoRoot := t.TempDir()
+	if err := os.Chdir(repoRoot); err != nil {
+		t.Fatalf("chdir: %v", err)
+	}
+	defer func() {
+		// Surface a failed restore: later tests would otherwise run from a
+		// temp dir that is about to be removed.
+		if err := os.Chdir(origDir); err != nil {
+			t.Errorf("restore working directory: %v", err)
+		}
+	}()
+
+	outputDir := filepath.Join(t.TempDir(), "custom-output")
+	got := resolveSandboxDenylistPaths(nil, outputDir)
+
+	for _, path := range got {
+		if path == outputDir {
+			return
+		}
+	}
+	t.Fatalf("expected denylist to include output dir %s, got %v", outputDir, got)
+}
+
+// TestParseAgentBehaviorMetrics writes a small synthetic agent log to disk and
+// checks the counters parseAgentBehaviorMetrics reports for it: two self-test
+// commands, one toolchain install attempt and two out-of-workspace reads.
+func TestParseAgentBehaviorMetrics(t *testing.T) {
+	t.Parallel()
+
+	logLines := []string{
+		"$ go test ./...",
+		"$ cargo test",
+		"$ curl -sL https://ziglang.org/download/0.13.0/zig-linux-x86_64-0.13.0.tar.xz | tar xJ",
+		"$ find / -name zig -type f 2>/dev/null | head -5",
+		"/home/user/project/eval-results/old-run",
+	}
+	logPath := filepath.Join(t.TempDir(), "agent.log")
+	if err := os.WriteFile(logPath, []byte(strings.Join(logLines, "\n")), 0o644); err != nil {
+		t.Fatalf("write log: %v", err)
+	}
+
+	metrics := parseAgentBehaviorMetrics(logPath)
+
+	if got := metrics.SelfTestCommands; got != 2 {
+		t.Fatalf("self test commands = %d, want 2", got)
+	}
+	if got := metrics.ToolchainInstallAttempts; got != 1 {
+		t.Fatalf("toolchain install attempts = %d, want 1", got)
+	}
+	if got := metrics.OutOfWorkspaceReads; got != 2 {
+		t.Fatalf("out-of-workspace reads = %d, want 2", got)
+	}
+}
@@ -174,7 +174,8 @@ type HarnessConfig struct {
 
 // SandboxConfig contains bubblewrap sandbox settings.
 type SandboxConfig struct {
-	WritableDirs []string `toml:"writable_dirs"` // Additional $HOME-relative dirs to mount writable
+	WritableDirs     []string `toml:"writable_dirs"`     // Additional $HOME-relative dirs to mount writable
+	ReadableDenylist []string `toml:"readable_denylist"` // Repo-relative or absolute paths to hide from agents
 }
 
 // DockerConfig contains Docker-related settings.
 
@@ -22,6 +22,9 @@ func TestDefault(t *testing.T) {
 	if Default.Docker.AutoPull != true {
 		t.Error("default auto pull should be true")
 	}
+	if len(Default.Sandbox.ReadableDenylist) != 0 {
+		t.Errorf("default readable denylist = %v, want empty", Default.Sandbox.ReadableDenylist)
+	}
 }
 
 func TestLoadNoFile(t *testing.T) {
@@ -59,7 +62,11 @@ max_attempts = 10
 [docker]
 go_image = "custom-go:latest"
 auto_pull = false
-`
+
+[sandbox]
+writable_dirs = ["go"]
+readable_denylist = ["tasks", "/tmp/secret"]
+	`
 	if err := os.WriteFile(cfgPath, []byte(content), 0644); err != nil {
 		t.Fatalf("writing config: %v", err)
 	}
@@ -84,6 +91,14 @@ auto_pull = false
 	if cfg.Docker.AutoPull != false {
 		t.Error("auto pull should be false")
 	}
+	if len(cfg.Sandbox.WritableDirs) != 1 || cfg.Sandbox.WritableDirs[0] != "go" {
+		t.Errorf("sandbox writable dirs = %v, want [go]", cfg.Sandbox.WritableDirs)
+	}
+	if len(cfg.Sandbox.ReadableDenylist) != 2 ||
+		cfg.Sandbox.ReadableDenylist[0] != "tasks" ||
+		cfg.Sandbox.ReadableDenylist[1] != "/tmp/secret" {
+		t.Errorf("sandbox readable denylist = %v, want [tasks /tmp/secret]", cfg.Sandbox.ReadableDenylist)
+	}
 }
 
 func TestLoadMissingExplicitFile(t *testing.T) {
Original file line number	Diff line number	Diff line change
`@@ -174,7 +174,8 @@ type HarnessConfig struct {`
`174`	`174`
`175`	`175`	`// SandboxConfig contains bubblewrap sandbox settings.`
`176`	`176`	`type SandboxConfig struct {`
`177`		- WritableDirs []string `toml:"writable_dirs"` // Additional $HOME-relative dirs to mount writable
	`177`	+ WritableDirs []string `toml:"writable_dirs"` // Additional $HOME-relative dirs to mount writable
	`178`	+ ReadableDenylist []string `toml:"readable_denylist"` // Repo-relative or absolute paths to hide from agents
`178`	`179`	`}`
`179`	`180`
`180`	`181`	`// DockerConfig contains Docker-related settings.`