@@ -142,8 +142,8 @@ type EvalResult struct {
142142 Tier string `json:"tier,omitempty"`
143143 Difficulty string `json:"difficulty,omitempty"`
144144 Passed bool `json:"passed"`
145- AgentTimedOut bool `json:"agent_timed_out,omitempty "`
146- Status task.ResultStatus `json:"status,omitempty "`
145+ AgentTimedOut bool `json:"agent_timed_out"`
146+ Status task.ResultStatus `json:"status"`
147147 Attempts int `json:"attempts"`
148148 Duration float64 `json:"duration_seconds"`
149149 AgentTime float64 `json:"agent_duration_seconds,omitempty"`
@@ -152,9 +152,9 @@ type EvalResult struct {
152152 Error string `json:"error,omitempty"`
153153 Weight float64 `json:"weight,omitempty"`
154154 WeightedScore float64 `json:"weighted_score,omitempty"`
155- QuotaRetries int `json:"quota_retries,omitempty "`
156- QuotaExhausted bool `json:"quota_exhausted,omitempty "`
157- InfraFailure bool `json:"infra_failure,omitempty "`
155+ QuotaRetries int `json:"quota_retries"`
156+ QuotaExhausted bool `json:"quota_exhausted"`
157+ InfraFailure bool `json:"infra_failure"`
158158 WorkspaceDir string `json:"-"` // Not serialized, used for cleanup
159159}
160160
@@ -177,7 +177,8 @@ type EvalSummary struct {
177177 Timestamp string `json:"timestamp"`
178178 Tier string `json:"tier,omitempty"`
179179 Difficulty string `json:"difficulty,omitempty"`
180- Parallel int `json:"parallel,omitempty"`
180+ Timeout int `json:"timeout"`
181+ Parallel int `json:"parallel"`
181182 Results []EvalResult `json:"results"`
182183 Passed int `json:"passed"`
183184 Failed int `json:"failed"`
@@ -194,12 +195,12 @@ type EvalSummary struct {
194195 ByLanguage map [string ]EvalAggregate `json:"by_language,omitempty"`
195196 ByTier map [string ]EvalAggregate `json:"by_tier,omitempty"`
196197 ByDifficulty map [string ]EvalAggregate `json:"by_difficulty,omitempty"`
197- UseMCPTools bool `json:"use_mcp_tools,omitempty "`
198- DisableMCP bool `json:"disable_mcp,omitempty "`
199- Sandbox bool `json:"sandbox,omitempty "`
200- Legacy bool `json:"legacy,omitempty "`
201- QuotaAffectedTasks int `json:"quota_affected_tasks,omitempty "`
202- TotalQuotaRetries int `json:"total_quota_retries,omitempty "`
198+ UseMCPTools bool `json:"use_mcp_tools"`
199+ DisableMCP bool `json:"disable_mcp"`
200+ Sandbox bool `json:"sandbox"`
201+ Legacy bool `json:"legacy"`
202+ QuotaAffectedTasks int `json:"quota_affected_tasks"`
203+ TotalQuotaRetries int `json:"total_quota_retries"`
203204}
204205
205206// RunSpec defines a single eval run's configuration.
@@ -236,11 +237,11 @@ type RunConfig struct {
236237 Tasks string `json:"tasks,omitempty"`
237238 Timeout int `json:"timeout"`
238239 Parallel int `json:"parallel"`
239- UseMCPTools bool `json:"use_mcp_tools,omitempty "`
240- DisableMCP bool `json:"disable_mcp,omitempty "`
241- NoSandbox bool `json:"no_sandbox,omitempty "`
242- Legacy bool `json:"legacy,omitempty "`
243- KeepWorkspaces bool `json:"keep_workspaces,omitempty "`
240+ UseMCPTools bool `json:"use_mcp_tools"`
241+ DisableMCP bool `json:"disable_mcp"`
242+ NoSandbox bool `json:"no_sandbox"`
243+ Legacy bool `json:"legacy"`
244+ KeepWorkspaces bool `json:"keep_workspaces"`
244245 TaskList []string `json:"task_list"`
245246 CreatedAt string `json:"created_at"`
246247}
@@ -1116,6 +1117,7 @@ func evalRunSingle( //nolint:gocognit,gocyclo,maintidx
11161117 Timestamp : timestamp ,
11171118 Tier : shared .Tier ,
11181119 Difficulty : shared .Difficulty ,
1120+ Timeout : shared .Timeout ,
11191121 Parallel : parallel ,
11201122 Results : results ,
11211123 Passed : passed ,
@@ -1349,6 +1351,11 @@ func runTaskWithAgent(ctx context.Context, r *runner.Runner, t *task.Task, agent
13491351 validationCmd = append (validationCmd , task .StripTxtExtension (filename ))
13501352 }
13511353 }
1354+ effectiveValidationCmd := t .ValidationCommand ()
1355+ if len (validationCmd ) > 0 {
1356+ effectiveValidationCmd = validationCmd
1357+ }
1358+ validationLogPath := filepath .Join (taskOutputDir , "validation.log" )
13521359
13531360 validateStart := time .Now ()
13541361 session , err := r .Run (ctx , runner.RunOptions {
@@ -1361,6 +1368,8 @@ func runTaskWithAgent(ctx context.Context, r *runner.Runner, t *task.Task, agent
13611368 result .ValidateTime = time .Since (validateStart ).Seconds ()
13621369
13631370 if err != nil {
1371+ timedOut := strings .Contains (strings .ToLower (err .Error ()), "timed out" )
1372+ writeValidationLog (validationLogPath , "" , effectiveValidationCmd , - 1 , time .Duration (result .ValidateTime * float64 (time .Second )), timedOut , err )
13641373 result .Error = err .Error ()
13651374 return result
13661375 }
@@ -1371,8 +1380,9 @@ func runTaskWithAgent(ctx context.Context, r *runner.Runner, t *task.Task, agent
13711380 // Save validation output to validation.log
13721381 if len (session .Attempts ) > 0 {
13731382 lastAttempt := session .Attempts [len (session .Attempts )- 1 ]
1374- validationLogPath := filepath .Join (taskOutputDir , "validation.log" )
1375- _ = os .WriteFile (validationLogPath , []byte (lastAttempt .RawOutput ), 0644 )
1383+ writeValidationLog (validationLogPath , lastAttempt .RawOutput , effectiveValidationCmd , lastAttempt .ExitCode , lastAttempt .Duration , lastAttempt .ExitCode == - 1 , nil )
1384+ } else {
1385+ writeValidationLog (validationLogPath , "" , effectiveValidationCmd , - 1 , 0 , false , nil )
13761386 }
13771387
13781388 return result
@@ -1552,6 +1562,7 @@ func runAgentAttempt(
15521562 if errors .Is (agentCtx .Err (), context .DeadlineExceeded ) {
15531563 result .timedOut = true
15541564 logger .Debug ("agent timed out" , "timeout" , agentTimeout )
1565+ writeAgentTimeoutFooter (logFile , attempt , agentTimeout , time .Since (agentStart ))
15551566 }
15561567 if agentErr != nil {
15571568 logger .Debug ("agent returned error" , "error" , agentErr )
@@ -1581,6 +1592,47 @@ func openAgentLogFile(agentLogPath string, attempt int) *os.File {
15811592 return logFile
15821593}
15831594
// writeAgentTimeoutFooter appends deterministic timeout evidence to the agent log.
//
// The footer is a single line that starts with the "HARNESS:" marker and is
// preceded by a blank line so it stays separated from whatever the agent wrote
// last. attempt is printed as attempt+1; timeout and runDuration are printed
// in seconds with millisecond precision.
//
// A nil logFile is a no-op. Write and sync failures are ignored on purpose:
// the footer is diagnostic only and must never change the outcome of a run.
func writeAgentTimeoutFooter(logFile *os.File, attempt int, timeout, runDuration time.Duration) {
	if logFile == nil {
		return
	}
	// Best-effort: a failed footer write must not mask the timeout itself.
	_, _ = fmt.Fprintf(
		logFile,
		"\n\nHARNESS: agent timed out (attempt=%d timeout_seconds=%.3f duration_seconds=%.3f)\n",
		attempt+1,
		timeout.Seconds(),
		runDuration.Seconds(),
	)
	// Flush so the footer is on disk even if the process is torn down soon after.
	_ = logFile.Sync()
}
1609+
// writeValidationLog persists validation output with a machine-readable footer.
//
// The file at path is overwritten with:
//   - rawOutput, if non-empty, terminated by a newline and followed by one
//     blank line;
//   - a "HARNESS: validation ..." line recording the space-joined command
//     (quoted), exitCode, duration in seconds, and the timedOut flag;
//   - when runErr is non-nil, a second "HARNESS: validation run_error=..."
//     line carrying the quoted error text.
//
// Every footer line starts at column 0 and ends with a bare newline so that
// line-oriented tooling can match on the "HARNESS:" prefix.
//
// The write is best-effort: a failure to persist the log is ignored so that
// logging problems never alter the validation result.
func writeValidationLog(path, rawOutput string, command []string, exitCode int, duration time.Duration, timedOut bool, runErr error) {
	var sb strings.Builder
	if rawOutput != "" {
		sb.WriteString(rawOutput)
		// Normalize to exactly one trailing newline before the separator.
		if !strings.HasSuffix(rawOutput, "\n") {
			sb.WriteString("\n")
		}
		// Blank line between the captured output and the harness footer.
		sb.WriteString("\n")
	}

	fmt.Fprintf(
		&sb,
		"HARNESS: validation command=%q exit_code=%d duration_seconds=%.3f timed_out=%t\n",
		strings.Join(command, " "),
		exitCode,
		duration.Seconds(),
		timedOut,
	)
	if runErr != nil {
		fmt.Fprintf(&sb, "HARNESS: validation run_error=%q\n", runErr.Error())
	}

	// Best-effort: the log is diagnostic evidence only.
	_ = os.WriteFile(path, []byte(sb.String()), 0o644)
}
1635+
15841636// toolchainInfo returns a human-readable toolchain description for the given language.
15851637func toolchainInfo (lang task.Language ) string {
15861638 switch lang {
@@ -2298,10 +2350,14 @@ type LeaderboardSubmission struct {
22982350 ResultsHash string `json:"results_hash"`
22992351
23002352 // Configuration
2301- UseMCPTools bool `json:"use_mcp_tools,omitempty"`
2302- DisableMCP bool `json:"disable_mcp,omitempty"`
2303- Sandbox bool `json:"sandbox,omitempty"`
2304- Legacy bool `json:"legacy,omitempty"`
2353+ Timeout int `json:"timeout"`
2354+ Parallel int `json:"parallel"`
2355+ UseMCPTools bool `json:"use_mcp_tools"`
2356+ DisableMCP bool `json:"disable_mcp"`
2357+ Sandbox bool `json:"sandbox"`
2358+ Legacy bool `json:"legacy"`
2359+ QuotaAffectedTasks int `json:"quota_affected_tasks"`
2360+ TotalQuotaRetries int `json:"total_quota_retries"`
23052361}
23062362
23072363// LeaderboardLanguageStats contains per-language metrics for the leaderboard.
@@ -2329,6 +2385,14 @@ func generateLeaderboardSubmission(summary EvalSummary, attestation *EvalAttesta
23292385 IntegrityViolations : summary .IntegrityViolations ,
23302386 TotalDurationSec : summary .Duration ,
23312387 AgentDurationSec : summary .AgentTime ,
2388+ Timeout : summary .Timeout ,
2389+ Parallel : summary .Parallel ,
2390+ UseMCPTools : summary .UseMCPTools ,
2391+ DisableMCP : summary .DisableMCP ,
2392+ Sandbox : summary .Sandbox ,
2393+ Legacy : summary .Legacy ,
2394+ QuotaAffectedTasks : summary .QuotaAffectedTasks ,
2395+ TotalQuotaRetries : summary .TotalQuotaRetries ,
23322396 ByLanguage : make (map [string ]LeaderboardLanguageStats ),
23332397 }
23342398
@@ -2340,12 +2404,6 @@ func generateLeaderboardSubmission(summary EvalSummary, attestation *EvalAttesta
23402404 submission .ResultsHash = attestation .Integrity .ResultsHash
23412405 }
23422406
2343- // Add configuration flags
2344- submission .UseMCPTools = summary .UseMCPTools
2345- submission .DisableMCP = summary .DisableMCP
2346- submission .Sandbox = summary .Sandbox
2347- submission .Legacy = summary .Legacy
2348-
23492407 // Convert language stats
23502408 for lang , agg := range summary .ByLanguage {
23512409 submission .ByLanguage [lang ] = LeaderboardLanguageStats {
0 commit comments