Skip to content

Commit 0dab2c5

Browse files
committed
feat: Standardize evaluation output and logging with HARNESS footers, increase default timeout, update task validation commands, and bump GitHub Actions versions.
1 parent a914cce commit 0dab2c5

File tree

16 files changed

+324
-52
lines changed

16 files changed

+324
-52
lines changed

.github/workflows/docker.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ jobs:
3333

3434
steps:
3535
- name: Checkout repository
36-
uses: actions/checkout@v4
36+
uses: actions/checkout@v6
3737

3838
- name: Log in to Container Registry
3939
uses: docker/login-action@v3
@@ -44,12 +44,12 @@ jobs:
4444

4545
- name: Extract metadata
4646
id: meta
47-
uses: docker/metadata-action@v5
47+
uses: docker/metadata-action@v6
4848
with:
4949
images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ matrix.image }}
5050

5151
- name: Build and push
52-
uses: docker/build-push-action@v5
52+
uses: docker/build-push-action@v6
5353
with:
5454
context: .
5555
file: ${{ matrix.dockerfile }}

.github/workflows/release.yml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ jobs:
1212
test:
1313
runs-on: ubuntu-latest
1414
steps:
15-
- uses: actions/checkout@v4
15+
- uses: actions/checkout@v6
1616

1717
- name: Set up Go
18-
uses: actions/setup-go@v5
18+
uses: actions/setup-go@v6
1919
with:
2020
go-version: '1.25.x'
2121

@@ -31,17 +31,17 @@ jobs:
3131
lint:
3232
runs-on: ubuntu-latest
3333
steps:
34-
- uses: actions/checkout@v4
34+
- uses: actions/checkout@v6
3535

3636
- name: Set up Go
37-
uses: actions/setup-go@v5
37+
uses: actions/setup-go@v6
3838
with:
3939
go-version: '1.25.x'
4040

4141
# Build golangci-lint from source to match our Go version
4242
# The pre-built binaries are compiled with Go 1.24 and can't lint Go 1.25 code
4343
- name: Install golangci-lint from source
44-
run: go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.7.2
44+
run: go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.10.1
4545

4646
- name: Run golangci-lint
4747
run: golangci-lint run --timeout=5m
@@ -50,12 +50,12 @@ jobs:
5050
needs: [test, lint]
5151
runs-on: ubuntu-latest
5252
steps:
53-
- uses: actions/checkout@v4
53+
- uses: actions/checkout@v6
5454
with:
5555
fetch-depth: 0
5656

5757
- name: Set up Go
58-
uses: actions/setup-go@v5
58+
uses: actions/setup-go@v6
5959
with:
6060
go-version: '1.25.x'
6161

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,8 @@ eval-results/<agent>-<timestamp>/
255255
├── submission.json # Leaderboard format
256256
├── run-config.json # Config for resume capability
257257
└── <task>/
258-
├── agent.log # Agent output during task execution
259-
└── validation.log # Test runner output from validation
258+
├── agent.log # Agent output during task execution (includes HARNESS timeout footer on agent timeout)
259+
└── validation.log # Test runner output + HARNESS validation footer (always non-empty)
260260
```
261261

262262
**Resume interrupted evals:** If interrupted (CTRL+C), the harness saves partial results and prints a resume command. Use `./sanity eval --resume <dir>` to continue.

docs/SCORING.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,10 @@ eval-results/<agent>-<timestamp>/
144144
├── attestation.json # BLAKE3 hashes for verification
145145
├── report.md # Human-readable Markdown report
146146
├── submission.json # Compact format for leaderboard
147+
├── run-config.json # Original run configuration (resume + audit)
147148
└── <lang>-<slug>/
148-
└── agent.log # Agent output (preserved even if workspace cleaned)
149+
├── agent.log # Agent output (includes HARNESS timeout footer on agent timeout)
150+
└── validation.log # Validation output (always includes HARNESS footer)
149151
```
150152

151153
### summary.json Schema
@@ -198,6 +200,11 @@ eval-results/<agent>-<timestamp>/
198200
}
199201
```
200202

203+
Notes:
204+
- `timeout`, `parallel`, `use_mcp_tools`, `disable_mcp`, `sandbox`, `legacy`,
205+
`quota_affected_tasks`, and `total_quota_retries` are always emitted.
206+
- Per-task `results[]` include explicit retry/infra metadata fields.
207+
201208
### attestation.json Schema
202209

203210
```json
@@ -261,6 +268,12 @@ Optimized for leaderboard submissions:
261268
}
262269
```
263270

271+
Notes:
272+
- `submission.json` includes run metadata and audit counters:
273+
`timeout`, `parallel`, `quota_affected_tasks`, and `total_quota_retries`.
274+
- Configuration booleans (`use_mcp_tools`, `disable_mcp`, `sandbox`, `legacy`)
275+
are always emitted as explicit booleans.
276+
264277
### report.md Format
265278

266279
The Markdown report includes:

internal/cli/eval.go

Lines changed: 87 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -142,8 +142,8 @@ type EvalResult struct {
142142
Tier string `json:"tier,omitempty"`
143143
Difficulty string `json:"difficulty,omitempty"`
144144
Passed bool `json:"passed"`
145-
AgentTimedOut bool `json:"agent_timed_out,omitempty"`
146-
Status task.ResultStatus `json:"status,omitempty"`
145+
AgentTimedOut bool `json:"agent_timed_out"`
146+
Status task.ResultStatus `json:"status"`
147147
Attempts int `json:"attempts"`
148148
Duration float64 `json:"duration_seconds"`
149149
AgentTime float64 `json:"agent_duration_seconds,omitempty"`
@@ -152,9 +152,9 @@ type EvalResult struct {
152152
Error string `json:"error,omitempty"`
153153
Weight float64 `json:"weight,omitempty"`
154154
WeightedScore float64 `json:"weighted_score,omitempty"`
155-
QuotaRetries int `json:"quota_retries,omitempty"`
156-
QuotaExhausted bool `json:"quota_exhausted,omitempty"`
157-
InfraFailure bool `json:"infra_failure,omitempty"`
155+
QuotaRetries int `json:"quota_retries"`
156+
QuotaExhausted bool `json:"quota_exhausted"`
157+
InfraFailure bool `json:"infra_failure"`
158158
WorkspaceDir string `json:"-"` // Not serialized, used for cleanup
159159
}
160160

@@ -177,7 +177,8 @@ type EvalSummary struct {
177177
Timestamp string `json:"timestamp"`
178178
Tier string `json:"tier,omitempty"`
179179
Difficulty string `json:"difficulty,omitempty"`
180-
Parallel int `json:"parallel,omitempty"`
180+
Timeout int `json:"timeout"`
181+
Parallel int `json:"parallel"`
181182
Results []EvalResult `json:"results"`
182183
Passed int `json:"passed"`
183184
Failed int `json:"failed"`
@@ -194,12 +195,12 @@ type EvalSummary struct {
194195
ByLanguage map[string]EvalAggregate `json:"by_language,omitempty"`
195196
ByTier map[string]EvalAggregate `json:"by_tier,omitempty"`
196197
ByDifficulty map[string]EvalAggregate `json:"by_difficulty,omitempty"`
197-
UseMCPTools bool `json:"use_mcp_tools,omitempty"`
198-
DisableMCP bool `json:"disable_mcp,omitempty"`
199-
Sandbox bool `json:"sandbox,omitempty"`
200-
Legacy bool `json:"legacy,omitempty"`
201-
QuotaAffectedTasks int `json:"quota_affected_tasks,omitempty"`
202-
TotalQuotaRetries int `json:"total_quota_retries,omitempty"`
198+
UseMCPTools bool `json:"use_mcp_tools"`
199+
DisableMCP bool `json:"disable_mcp"`
200+
Sandbox bool `json:"sandbox"`
201+
Legacy bool `json:"legacy"`
202+
QuotaAffectedTasks int `json:"quota_affected_tasks"`
203+
TotalQuotaRetries int `json:"total_quota_retries"`
203204
}
204205

205206
// RunSpec defines a single eval run's configuration.
@@ -236,11 +237,11 @@ type RunConfig struct {
236237
Tasks string `json:"tasks,omitempty"`
237238
Timeout int `json:"timeout"`
238239
Parallel int `json:"parallel"`
239-
UseMCPTools bool `json:"use_mcp_tools,omitempty"`
240-
DisableMCP bool `json:"disable_mcp,omitempty"`
241-
NoSandbox bool `json:"no_sandbox,omitempty"`
242-
Legacy bool `json:"legacy,omitempty"`
243-
KeepWorkspaces bool `json:"keep_workspaces,omitempty"`
240+
UseMCPTools bool `json:"use_mcp_tools"`
241+
DisableMCP bool `json:"disable_mcp"`
242+
NoSandbox bool `json:"no_sandbox"`
243+
Legacy bool `json:"legacy"`
244+
KeepWorkspaces bool `json:"keep_workspaces"`
244245
TaskList []string `json:"task_list"`
245246
CreatedAt string `json:"created_at"`
246247
}
@@ -1116,6 +1117,7 @@ func evalRunSingle( //nolint:gocognit,gocyclo,maintidx
11161117
Timestamp: timestamp,
11171118
Tier: shared.Tier,
11181119
Difficulty: shared.Difficulty,
1120+
Timeout: shared.Timeout,
11191121
Parallel: parallel,
11201122
Results: results,
11211123
Passed: passed,
@@ -1349,6 +1351,11 @@ func runTaskWithAgent(ctx context.Context, r *runner.Runner, t *task.Task, agent
13491351
validationCmd = append(validationCmd, task.StripTxtExtension(filename))
13501352
}
13511353
}
1354+
effectiveValidationCmd := t.ValidationCommand()
1355+
if len(validationCmd) > 0 {
1356+
effectiveValidationCmd = validationCmd
1357+
}
1358+
validationLogPath := filepath.Join(taskOutputDir, "validation.log")
13521359

13531360
validateStart := time.Now()
13541361
session, err := r.Run(ctx, runner.RunOptions{
@@ -1361,6 +1368,8 @@ func runTaskWithAgent(ctx context.Context, r *runner.Runner, t *task.Task, agent
13611368
result.ValidateTime = time.Since(validateStart).Seconds()
13621369

13631370
if err != nil {
1371+
timedOut := strings.Contains(strings.ToLower(err.Error()), "timed out")
1372+
writeValidationLog(validationLogPath, "", effectiveValidationCmd, -1, time.Duration(result.ValidateTime*float64(time.Second)), timedOut, err)
13641373
result.Error = err.Error()
13651374
return result
13661375
}
@@ -1371,8 +1380,9 @@ func runTaskWithAgent(ctx context.Context, r *runner.Runner, t *task.Task, agent
13711380
// Save validation output to validation.log
13721381
if len(session.Attempts) > 0 {
13731382
lastAttempt := session.Attempts[len(session.Attempts)-1]
1374-
validationLogPath := filepath.Join(taskOutputDir, "validation.log")
1375-
_ = os.WriteFile(validationLogPath, []byte(lastAttempt.RawOutput), 0644)
1383+
writeValidationLog(validationLogPath, lastAttempt.RawOutput, effectiveValidationCmd, lastAttempt.ExitCode, lastAttempt.Duration, lastAttempt.ExitCode == -1, nil)
1384+
} else {
1385+
writeValidationLog(validationLogPath, "", effectiveValidationCmd, -1, 0, false, nil)
13761386
}
13771387

13781388
return result
@@ -1552,6 +1562,7 @@ func runAgentAttempt(
15521562
if errors.Is(agentCtx.Err(), context.DeadlineExceeded) {
15531563
result.timedOut = true
15541564
logger.Debug("agent timed out", "timeout", agentTimeout)
1565+
writeAgentTimeoutFooter(logFile, attempt, agentTimeout, time.Since(agentStart))
15551566
}
15561567
if agentErr != nil {
15571568
logger.Debug("agent returned error", "error", agentErr)
@@ -1581,6 +1592,47 @@ func openAgentLogFile(agentLogPath string, attempt int) *os.File {
15811592
return logFile
15821593
}
15831594

1595+
// writeAgentTimeoutFooter appends a single HARNESS line to the agent log
// recording that the agent hit its timeout.
//
// The line carries the 1-based attempt number (attempt is 0-based), the
// configured timeout, and the observed run duration, both in seconds with
// millisecond precision. A nil logFile is a no-op.
func writeAgentTimeoutFooter(logFile *os.File, attempt int, timeout, runDuration time.Duration) {
	if logFile != nil {
		footer := fmt.Sprintf(
			"\n\nHARNESS: agent timed out (attempt=%d timeout_seconds=%.3f duration_seconds=%.3f)\n",
			attempt+1,
			timeout.Seconds(),
			runDuration.Seconds(),
		)
		// Best-effort: a failure to record the footer must not affect the run result.
		_, _ = logFile.WriteString(footer)
		_ = logFile.Sync()
	}
}
1609+
1610+
// writeValidationLog persists validation output followed by a machine-readable
// HARNESS footer, so the log is non-empty even when the command printed nothing.
//
// Layout of the written file:
//   - rawOutput, if non-empty, terminated with a newline and followed by one
//     blank line;
//   - a footer line with the space-joined command (quoted), the exit code, the
//     duration in seconds (millisecond precision), and the timed-out flag;
//   - if runErr is non-nil, a second footer line with the quoted error text.
//
// The write is best-effort: failures are ignored so that logging can never
// change a task's result. The parent directory of path is created if missing,
// so early-failure paths still leave a log behind.
func writeValidationLog(path, rawOutput string, command []string, exitCode int, duration time.Duration, timedOut bool, runErr error) {
	var sb strings.Builder
	if rawOutput != "" {
		sb.WriteString(rawOutput)
		if !strings.HasSuffix(rawOutput, "\n") {
			sb.WriteByte('\n')
		}
		// Blank separator line between the raw output and the footer.
		sb.WriteByte('\n')
	}

	// Writes to a strings.Builder cannot fail, so the Fprintf results are not checked.
	fmt.Fprintf(
		&sb,
		"HARNESS: validation command=%q exit_code=%d duration_seconds=%.3f timed_out=%t\n",
		strings.Join(command, " "),
		exitCode,
		duration.Seconds(),
		timedOut,
	)
	if runErr != nil {
		fmt.Fprintf(&sb, "HARNESS: validation run_error=%q\n", runErr.Error())
	}

	// Best-effort: make sure the destination directory exists before writing.
	// NOTE(review): callers presumably create the task output directory earlier;
	// this only guards the case where that did not happen.
	_ = os.MkdirAll(filepath.Dir(path), 0o755)
	_ = os.WriteFile(path, []byte(sb.String()), 0o644)
}
1635+
15841636
// toolchainInfo returns a human-readable toolchain description for the given language.
15851637
func toolchainInfo(lang task.Language) string {
15861638
switch lang {
@@ -2298,10 +2350,14 @@ type LeaderboardSubmission struct {
22982350
ResultsHash string `json:"results_hash"`
22992351

23002352
// Configuration
2301-
UseMCPTools bool `json:"use_mcp_tools,omitempty"`
2302-
DisableMCP bool `json:"disable_mcp,omitempty"`
2303-
Sandbox bool `json:"sandbox,omitempty"`
2304-
Legacy bool `json:"legacy,omitempty"`
2353+
Timeout int `json:"timeout"`
2354+
Parallel int `json:"parallel"`
2355+
UseMCPTools bool `json:"use_mcp_tools"`
2356+
DisableMCP bool `json:"disable_mcp"`
2357+
Sandbox bool `json:"sandbox"`
2358+
Legacy bool `json:"legacy"`
2359+
QuotaAffectedTasks int `json:"quota_affected_tasks"`
2360+
TotalQuotaRetries int `json:"total_quota_retries"`
23052361
}
23062362

23072363
// LeaderboardLanguageStats contains per-language metrics for the leaderboard.
@@ -2329,6 +2385,14 @@ func generateLeaderboardSubmission(summary EvalSummary, attestation *EvalAttesta
23292385
IntegrityViolations: summary.IntegrityViolations,
23302386
TotalDurationSec: summary.Duration,
23312387
AgentDurationSec: summary.AgentTime,
2388+
Timeout: summary.Timeout,
2389+
Parallel: summary.Parallel,
2390+
UseMCPTools: summary.UseMCPTools,
2391+
DisableMCP: summary.DisableMCP,
2392+
Sandbox: summary.Sandbox,
2393+
Legacy: summary.Legacy,
2394+
QuotaAffectedTasks: summary.QuotaAffectedTasks,
2395+
TotalQuotaRetries: summary.TotalQuotaRetries,
23322396
ByLanguage: make(map[string]LeaderboardLanguageStats),
23332397
}
23342398

@@ -2340,12 +2404,6 @@ func generateLeaderboardSubmission(summary EvalSummary, attestation *EvalAttesta
23402404
submission.ResultsHash = attestation.Integrity.ResultsHash
23412405
}
23422406

2343-
// Add configuration flags
2344-
submission.UseMCPTools = summary.UseMCPTools
2345-
submission.DisableMCP = summary.DisableMCP
2346-
submission.Sandbox = summary.Sandbox
2347-
submission.Legacy = summary.Legacy
2348-
23492407
// Convert language stats
23502408
for lang, agg := range summary.ByLanguage {
23512409
submission.ByLanguage[lang] = LeaderboardLanguageStats{

0 commit comments

Comments
 (0)