Skip to content

Commit 5cdcec4

Browse files
committed
fix: timeout regression (30s→120s) and add --legacy flag for pre-v1.6.0 compatibility
1 parent 6362fe6 commit 5cdcec4

File tree

4 files changed

+43
-11
lines changed

4 files changed

+43
-11
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
A lightweight evaluation harness for coding agents that runs high-signal, compact but challenging problems in isolated Docker containers. Evaluate agents across 26 tasks in 6 languages with weighted scoring, integrity verification, and detailed reporting.
1111

12-
> **Note:** All evaluation results obtained before version `v1.6.0` cannot be compared to results obtained on or after `v1.6.0` due to a critical fix in how hidden tests are handled.
12+
> **Note:** Evaluation results obtained before version `v1.6.0` are not comparable to results obtained on or after `v1.6.0`, due to a critical fix in how hidden tests are handled. Version `v1.6.1` fixes a timeout regression (the default was incorrectly 30s instead of 120s) and adds the `--legacy` flag.
1313
1414
<!-- Add demo GIF/screenshot here -->
1515

@@ -104,6 +104,7 @@ make build # Build the CLI
104104
./sanity eval --agent opencode --disable-mcp # Disable MCP tools (currently only supported for opencode)
105105
./sanity eval --agent opencode --keep-workspaces # Keep workspaces for debugging
106106
./sanity eval --agent gemini --no-sandbox # Disable bubblewrap sandbox
107+
./sanity eval --agent gemini --legacy # Legacy mode (hidden tests visible to agent)
107108
./sanity eval --resume ./eval-results/gemini-... # Resume interrupted eval
108109
```
109110

@@ -216,6 +217,8 @@ See [docs/CONFIGURATION.md#agent-configuration](docs/CONFIGURATION.md#agent-conf
216217

217218
> **Sandbox note:** `sanity eval` runs agents inside a [bubblewrap](https://github.com/containers/bubblewrap) sandbox where `$HOME` is read-only. All dot-directories under `$HOME` (e.g. `~/.my-agent/`) are automatically writable, so most agents work out of the box. For non-dot directories, add them to `sanity.toml` under `[sandbox] writable_dirs`. Use `--no-sandbox` to disable.
218219
220+
> **Legacy mode:** Prior to v1.6.0, a bug caused hidden tests to be included in the workspace during `sanity eval`, making them visible to agents. The `--legacy` flag reproduces this behavior so that results can be compared fairly with older evaluation runs, and so that such runs can be resumed. When `--legacy` is active, hidden test files are written to the workspace at init time, and the step that normally overlays them just before validation is skipped. Use this flag when resuming runs that were originally executed with the buggy behavior.
221+
219222
## How It Works
220223

221224
1. **Container Strategy**: Containers run `sleep infinity`; commands execute via `docker exec` for fast reuse

internal/cli/eval.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ var (
4747
evalUseMCPTools bool
4848
evalDisableMCP bool
4949
evalNoSandbox bool
50+
evalLegacy bool
5051
evalSandboxActive bool
5152
evalResume string
5253
)
@@ -184,6 +185,7 @@ type EvalSummary struct {
184185
UseMCPTools bool `json:"use_mcp_tools,omitempty"`
185186
DisableMCP bool `json:"disable_mcp,omitempty"`
186187
Sandbox bool `json:"sandbox,omitempty"`
188+
Legacy bool `json:"legacy,omitempty"`
187189
QuotaAffectedTasks int `json:"quota_affected_tasks,omitempty"`
188190
TotalQuotaRetries int `json:"total_quota_retries,omitempty"`
189191
}
@@ -202,6 +204,7 @@ type RunConfig struct {
202204
UseMCPTools bool `json:"use_mcp_tools,omitempty"`
203205
DisableMCP bool `json:"disable_mcp,omitempty"`
204206
NoSandbox bool `json:"no_sandbox,omitempty"`
207+
Legacy bool `json:"legacy,omitempty"`
205208
KeepWorkspaces bool `json:"keep_workspaces,omitempty"`
206209
TaskList []string `json:"task_list"`
207210
CreatedAt string `json:"created_at"`
@@ -323,6 +326,11 @@ Examples:
323326
}
324327
defer func() { _ = r.Close() }()
325328

329+
if evalLegacy {
330+
r.LegacyHiddenTests = true
331+
logger.Info("legacy mode enabled: hidden tests exposed to agent (pre-v1.6.0 behavior)")
332+
}
333+
326334
// If the user specified another selector, default tier should not hide tasks.
327335
tierChanged := cmd.Flags().Changed("tier")
328336
if !tierChanged && (evalLang != "" || evalTasks != "" || evalDifficulty != "") {
@@ -926,6 +934,7 @@ Examples:
926934
UseMCPTools: evalUseMCPTools,
927935
DisableMCP: evalDisableMCP,
928936
Sandbox: evalSandboxActive,
937+
Legacy: evalLegacy,
929938
QuotaAffectedTasks: quotaAffectedTasks,
930939
TotalQuotaRetries: totalQuotaRetries,
931940
}
@@ -1072,9 +1081,12 @@ func runTaskWithAgent(ctx context.Context, r *runner.Runner, t *task.Task, agent
10721081
}
10731082

10741083
// Add hidden tests (not shown to the agent) before validation.
1075-
if err := writeTaskFilesToWorkspace(loader, t, workspaceDir, t.HiddenTestFiles()); err != nil {
1076-
result.Error = fmt.Sprintf("writing hidden tests: %v", err)
1077-
return result
1084+
// In legacy mode, hidden tests are already in the workspace from init.
1085+
if !evalLegacy {
1086+
if err := writeTaskFilesToWorkspace(loader, t, workspaceDir, t.HiddenTestFiles()); err != nil {
1087+
result.Error = fmt.Sprintf("writing hidden tests: %v", err)
1088+
return result
1089+
}
10781090
}
10791091

10801092
// Run sanity harness to validate.
@@ -1982,6 +1994,7 @@ type LeaderboardSubmission struct {
19821994
UseMCPTools bool `json:"use_mcp_tools,omitempty"`
19831995
DisableMCP bool `json:"disable_mcp,omitempty"`
19841996
Sandbox bool `json:"sandbox,omitempty"`
1997+
Legacy bool `json:"legacy,omitempty"`
19851998
}
19861999

19872000
// LeaderboardLanguageStats contains per-language metrics for the leaderboard.
@@ -2026,6 +2039,7 @@ func generateLeaderboardSubmission(summary EvalSummary, attestation *EvalAttesta
20262039
submission.UseMCPTools = summary.UseMCPTools
20272040
submission.DisableMCP = summary.DisableMCP
20282041
submission.Sandbox = summary.Sandbox
2042+
submission.Legacy = summary.Legacy
20292043

20302044
// Convert language stats
20312045
for lang, agg := range summary.ByLanguage {
@@ -2078,6 +2092,9 @@ func writeReportSummary(sb *strings.Builder, summary EvalSummary) {
20782092
if summary.Sandbox {
20792093
sb.WriteString("| Sandbox | Yes |\n")
20802094
}
2095+
if summary.Legacy {
2096+
sb.WriteString("| Legacy Mode | Yes |\n")
2097+
}
20812098
fmt.Fprintf(sb, "| Timestamp | %s |\n", summary.Timestamp)
20822099
fmt.Fprintf(sb, "| Pass Rate | **%.1f%%** (%d/%d) |\n", summary.PassRate, summary.Passed, summary.Total)
20832100
fmt.Fprintf(sb, "| Weighted Pass Rate | **%.1f%%** |\n", summary.WeightedPassRate)
@@ -2315,6 +2332,7 @@ func saveRunConfig(outputDir string, allTasks []*task.Task) error {
23152332
UseMCPTools: evalUseMCPTools,
23162333
DisableMCP: evalDisableMCP,
23172334
NoSandbox: evalNoSandbox,
2335+
Legacy: evalLegacy,
23182336
KeepWorkspaces: evalKeepWorkspaces,
23192337
TaskList: taskList,
23202338
CreatedAt: time.Now().Format(time.RFC3339),
@@ -2357,6 +2375,7 @@ func applyRunConfig(runCfg *RunConfig) {
23572375
evalUseMCPTools = runCfg.UseMCPTools
23582376
evalDisableMCP = runCfg.DisableMCP
23592377
evalNoSandbox = runCfg.NoSandbox
2378+
evalLegacy = runCfg.Legacy
23602379
evalKeepWorkspaces = runCfg.KeepWorkspaces
23612380
}
23622381

@@ -2563,5 +2582,6 @@ func init() {
25632582
evalCmd.Flags().BoolVar(&evalUseMCPTools, "use-mcp-tools", false, "inject MCP tool usage instructions into agent prompt")
25642583
evalCmd.Flags().BoolVar(&evalDisableMCP, "disable-mcp", false, "disable MCP tools for agents that support it (currently: opencode)")
25652584
evalCmd.Flags().BoolVar(&evalNoSandbox, "no-sandbox", false, "disable bubblewrap sandbox for agent processes")
2585+
evalCmd.Flags().BoolVar(&evalLegacy, "legacy", false, "expose hidden tests to agent during workspace init (pre-v1.6.0 behavior)")
25662586
evalCmd.Flags().StringVar(&evalResume, "resume", "", "resume eval from existing output directory")
25672587
}

internal/config/config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ type DockerConfig struct {
191191
var Default = Config{
192192
Harness: HarnessConfig{
193193
SessionDir: "./sessions",
194-
DefaultTimeout: 30,
194+
DefaultTimeout: 120,
195195
MaxAttempts: 5,
196196
OutputFormat: "all",
197197
},

internal/runner/runner.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,11 @@ import (
2121

2222
// Runner orchestrates task execution.
2323
type Runner struct {
24-
cfg *config.Config
25-
taskLoader *task.Loader
26-
docker *DockerClient
27-
logger *slog.Logger
24+
cfg *config.Config
25+
taskLoader *task.Loader
26+
docker *DockerClient
27+
logger *slog.Logger
28+
LegacyHiddenTests bool // When true, include hidden tests in workspace init (pre-v1.6.0 behavior)
2829
}
2930

3031
// NewRunner creates a new runner.
@@ -412,7 +413,11 @@ func (r *Runner) ensureWorkspace(t *task.Task, dir string) error {
412413
return nil
413414
}
414415

415-
return r.copyTaskFiles(t, dir, t.VisibleFiles())
416+
files := t.VisibleFiles()
417+
if r.LegacyHiddenTests {
418+
files = t.AllFiles()
419+
}
420+
return r.copyTaskFiles(t, dir, files)
416421
}
417422

418423
// captureWorkspace reads the workspace files into the session.
@@ -500,7 +505,11 @@ func (r *Runner) InitWorkspaceForTask(t *task.Task, outputDir string) error {
500505
return fmt.Errorf("directory is not empty: %s", absDir)
501506
}
502507

503-
return r.copyTaskFiles(t, absDir, t.VisibleFiles())
508+
files := t.VisibleFiles()
509+
if r.LegacyHiddenTests {
510+
files = t.AllFiles()
511+
}
512+
return r.copyTaskFiles(t, absDir, files)
504513
}
505514

506515
// ListTasks returns all available tasks.

0 commit comments

Comments
 (0)