fix: restore 120s default timeout (regressed to 30s) and add --legacy flag for pre-v1.6.0 compatibility

lemon07r · lemon07r · commit 5cdcec45fca5 · 2026-02-20T12:45:50.000-05:00
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 
 A lightweight evaluation harness for coding agents that runs high-signal, compact but challenging problems in isolated Docker containers. Evaluate agents across 26 tasks in 6 languages with weighted scoring, integrity verification, and detailed reporting.
 
-> **Note:** All evaluation results obtained before version `v1.6.0` cannot be compared to results obtained on or after `v1.6.0` due to a critical fix in how hidden tests are handled.
+> **Note:** All evaluation results obtained before version `v1.6.0` cannot be compared to results obtained on or after `v1.6.0` due to a critical fix in how hidden tests are handled. Version `v1.6.1` fixes a timeout regression (the default was incorrectly 30s instead of 120s) and adds the `--legacy` flag, which reproduces the pre-`v1.6.0` hidden-test handling.
 
 <!-- Add demo GIF/screenshot here -->
 
@@ -104,6 +104,7 @@ make build    # Build the CLI
 ./sanity eval --agent opencode --disable-mcp          # Disable MCP tools / currently only supported for opencode
 ./sanity eval --agent opencode --keep-workspaces      # Keep workspaces for debugging
 ./sanity eval --agent gemini --no-sandbox             # Disable bubblewrap sandbox
+./sanity eval --agent gemini --legacy                 # Legacy mode (hidden tests visible to agent)
 ./sanity eval --resume ./eval-results/gemini-...      # Resume interrupted eval
 ```
 
@@ -216,6 +217,8 @@ See [docs/CONFIGURATION.md#agent-configuration](docs/CONFIGURATION.md#agent-conf
 
 > **Sandbox note:** `sanity eval` runs agents inside a [bubblewrap](https://github.com/containers/bubblewrap) sandbox where `$HOME` is read-only. All dot-directories under `$HOME` (e.g. `~/.my-agent/`) are automatically writable, so most agents work out of the box. For non-dot directories, add them to `sanity.toml` under `[sandbox] writable_dirs`. Use `--no-sandbox` to disable.
 
+> **Legacy mode:** Prior to v1.6.0, a bug caused hidden tests to be included in the workspace during `sanity eval`, making them visible to agents. The `--legacy` flag reproduces this behavior so that new runs can be compared fairly against pre-v1.6.0 results and interrupted pre-v1.6.0 runs can be resumed under the same conditions. When `--legacy` is active, hidden test files are written to the workspace at init time, and the step that normally overlays them just before validation is skipped. Legacy runs are marked as such in the generated summary and report. Use this flag when resuming runs that were originally executed with the buggy behavior.
+
 ## How It Works
 
 1. **Container Strategy**: Containers run `sleep infinity`; commands execute via `docker exec` for fast reuse
diff --git a/internal/cli/eval.go b/internal/cli/eval.go
@@ -47,6 +47,7 @@ var (
 	evalUseMCPTools    bool
 	evalDisableMCP     bool
 	evalNoSandbox      bool
+	evalLegacy         bool
 	evalSandboxActive  bool
 	evalResume         string
 )
@@ -184,6 +185,7 @@ type EvalSummary struct {
 	UseMCPTools         bool                     `json:"use_mcp_tools,omitempty"`
 	DisableMCP          bool                     `json:"disable_mcp,omitempty"`
 	Sandbox             bool                     `json:"sandbox,omitempty"`
+	Legacy              bool                     `json:"legacy,omitempty"`
 	QuotaAffectedTasks  int                      `json:"quota_affected_tasks,omitempty"`
 	TotalQuotaRetries   int                      `json:"total_quota_retries,omitempty"`
 }
@@ -202,6 +204,7 @@ type RunConfig struct {
 	UseMCPTools    bool     `json:"use_mcp_tools,omitempty"`
 	DisableMCP     bool     `json:"disable_mcp,omitempty"`
 	NoSandbox      bool     `json:"no_sandbox,omitempty"`
+	Legacy         bool     `json:"legacy,omitempty"`
 	KeepWorkspaces bool     `json:"keep_workspaces,omitempty"`
 	TaskList       []string `json:"task_list"`
 	CreatedAt      string   `json:"created_at"`
@@ -323,6 +326,11 @@ Examples:
 		}
 		defer func() { _ = r.Close() }()
 
+		if evalLegacy {
+			r.LegacyHiddenTests = true
+			logger.Info("legacy mode enabled: hidden tests exposed to agent (pre-v1.6.0 behavior)")
+		}
+
 		// If the user specified another selector, default tier should not hide tasks.
 		tierChanged := cmd.Flags().Changed("tier")
 		if !tierChanged && (evalLang != "" || evalTasks != "" || evalDifficulty != "") {
@@ -926,6 +934,7 @@ Examples:
 			UseMCPTools:         evalUseMCPTools,
 			DisableMCP:          evalDisableMCP,
 			Sandbox:             evalSandboxActive,
+			Legacy:              evalLegacy,
 			QuotaAffectedTasks:  quotaAffectedTasks,
 			TotalQuotaRetries:   totalQuotaRetries,
 		}
@@ -1072,9 +1081,12 @@ func runTaskWithAgent(ctx context.Context, r *runner.Runner, t *task.Task, agent
 	}
 
 	// Add hidden tests (not shown to the agent) before validation.
-	if err := writeTaskFilesToWorkspace(loader, t, workspaceDir, t.HiddenTestFiles()); err != nil {
-		result.Error = fmt.Sprintf("writing hidden tests: %v", err)
-		return result
+	// In legacy mode, hidden tests are already in the workspace from init.
+	if !evalLegacy {
+		if err := writeTaskFilesToWorkspace(loader, t, workspaceDir, t.HiddenTestFiles()); err != nil {
+			result.Error = fmt.Sprintf("writing hidden tests: %v", err)
+			return result
+		}
 	}
 
 	// Run sanity harness to validate.
@@ -1982,6 +1994,7 @@ type LeaderboardSubmission struct {
 	UseMCPTools bool `json:"use_mcp_tools,omitempty"`
 	DisableMCP  bool `json:"disable_mcp,omitempty"`
 	Sandbox     bool `json:"sandbox,omitempty"`
+	Legacy      bool `json:"legacy,omitempty"`
 }
 
 // LeaderboardLanguageStats contains per-language metrics for the leaderboard.
@@ -2026,6 +2039,7 @@ func generateLeaderboardSubmission(summary EvalSummary, attestation *EvalAttesta
 	submission.UseMCPTools = summary.UseMCPTools
 	submission.DisableMCP = summary.DisableMCP
 	submission.Sandbox = summary.Sandbox
+	submission.Legacy = summary.Legacy
 
 	// Convert language stats
 	for lang, agg := range summary.ByLanguage {
@@ -2078,6 +2092,9 @@ func writeReportSummary(sb *strings.Builder, summary EvalSummary) {
 	if summary.Sandbox {
 		sb.WriteString("| Sandbox | Yes |\n")
 	}
+	if summary.Legacy {
+		sb.WriteString("| Legacy Mode | Yes |\n")
+	}
 	fmt.Fprintf(sb, "| Timestamp | %s |\n", summary.Timestamp)
 	fmt.Fprintf(sb, "| Pass Rate | **%.1f%%** (%d/%d) |\n", summary.PassRate, summary.Passed, summary.Total)
 	fmt.Fprintf(sb, "| Weighted Pass Rate | **%.1f%%** |\n", summary.WeightedPassRate)
@@ -2315,6 +2332,7 @@ func saveRunConfig(outputDir string, allTasks []*task.Task) error {
 		UseMCPTools:    evalUseMCPTools,
 		DisableMCP:     evalDisableMCP,
 		NoSandbox:      evalNoSandbox,
+		Legacy:         evalLegacy,
 		KeepWorkspaces: evalKeepWorkspaces,
 		TaskList:       taskList,
 		CreatedAt:      time.Now().Format(time.RFC3339),
@@ -2357,6 +2375,7 @@ func applyRunConfig(runCfg *RunConfig) {
 	evalUseMCPTools = runCfg.UseMCPTools
 	evalDisableMCP = runCfg.DisableMCP
 	evalNoSandbox = runCfg.NoSandbox
+	evalLegacy = runCfg.Legacy
 	evalKeepWorkspaces = runCfg.KeepWorkspaces
 }
 
@@ -2563,5 +2582,6 @@ func init() {
 	evalCmd.Flags().BoolVar(&evalUseMCPTools, "use-mcp-tools", false, "inject MCP tool usage instructions into agent prompt")
 	evalCmd.Flags().BoolVar(&evalDisableMCP, "disable-mcp", false, "disable MCP tools for agents that support it (currently: opencode)")
 	evalCmd.Flags().BoolVar(&evalNoSandbox, "no-sandbox", false, "disable bubblewrap sandbox for agent processes")
+	evalCmd.Flags().BoolVar(&evalLegacy, "legacy", false, "expose hidden tests to agent during workspace init (pre-v1.6.0 behavior)")
 	evalCmd.Flags().StringVar(&evalResume, "resume", "", "resume eval from existing output directory")
 }
diff --git a/internal/config/config.go b/internal/config/config.go
@@ -191,7 +191,7 @@ type DockerConfig struct {
 var Default = Config{
 	Harness: HarnessConfig{
 		SessionDir:     "./sessions",
-		DefaultTimeout: 30,
+		DefaultTimeout: 120,
 		MaxAttempts:    5,
 		OutputFormat:   "all",
 	},
diff --git a/internal/runner/runner.go b/internal/runner/runner.go
@@ -21,10 +21,11 @@ import (
 
 // Runner orchestrates task execution.
 type Runner struct {
-	cfg        *config.Config
-	taskLoader *task.Loader
-	docker     *DockerClient
-	logger     *slog.Logger
+	cfg                *config.Config
+	taskLoader         *task.Loader
+	docker             *DockerClient
+	logger             *slog.Logger
+	LegacyHiddenTests  bool // When true, include hidden tests in workspace init (pre-v1.6.0 behavior)
 }
 
 // NewRunner creates a new runner.
@@ -412,7 +413,11 @@ func (r *Runner) ensureWorkspace(t *task.Task, dir string) error {
 		return nil
 	}
 
-	return r.copyTaskFiles(t, dir, t.VisibleFiles())
+	files := t.VisibleFiles()
+	if r.LegacyHiddenTests {
+		files = t.AllFiles()
+	}
+	return r.copyTaskFiles(t, dir, files)
 }
 
 // captureWorkspace reads the workspace files into the session.
@@ -500,7 +505,11 @@ func (r *Runner) InitWorkspaceForTask(t *task.Task, outputDir string) error {
 		return fmt.Errorf("directory is not empty: %s", absDir)
 	}
 
-	return r.copyTaskFiles(t, absDir, t.VisibleFiles())
+	files := t.VisibleFiles()
+	if r.LegacyHiddenTests {
+		files = t.AllFiles()
+	}
+	return r.copyTaskFiles(t, absDir, files)
 }
 
 // ListTasks returns all available tasks.