Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,30 @@ bun start --skills --model "claude-sonnet-4-5"
BRAINTRUST_API_KEY=sk-... bun report:braintrust
```

### Agent Evaluations (Multi-Agent)

```bash
# Run agent evals with Claude Code
bun agent:claude
bun agent:claude --eval add-auth --debug

# Run agent evals with Codex
bun agent:codex
bun agent:codex --eval add-auth --debug

# Multi-trial (3 runs per eval, pass@k metrics)
bun agent:claude --runs 3
bun agent:codex --runs 3

# With skills or MCP
bun agent:claude --skills
bun agent:claude --skills --mcp

# Cross-agent leaderboard export
bun export:leaderboard
bun export:leaderboard --since 2026-03-20
```

## Project Structure & Module Organization
`src/index.ts` wires providers, runners, reporters, and every folder under `src/evals`. Keep each evaluation in its own directory with `PROMPT.md`, `graders.ts`, and any fixtures it needs. Use descriptive, numeric-free slugs like `src/evals/new-eval`. Runner logic lives in `src/runners`, shared provider clients in `src/providers`, scoring helpers in `src/scorers`, and reusable utilities in `src/utils`. Diagrams intended for contributor onboarding belong in `docs/`, while transient artifacts like `scores.json` stay gitignored at the root.

Expand Down
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@
"start:agent": "bun src/agent-index.ts",
"agent:claude": "bun src/agent-index.ts --agent claude-code",
"agent:claude:mcp": "bun src/agent-index.ts --agent claude-code --mcp",
"agent:codex": "bun src/agent-index.ts --agent codex",
"agent:skills": "bun src/agent-index.ts --agent claude-code --skills",
"agent:skills:mcp": "bun src/agent-index.ts --agent claude-code --skills --mcp",
"report:braintrust": "bun src/report-braintrust.ts",
"export:leaderboard": "bun src/export/leaderboard.ts",
"export": "bun export-from-db.ts",
"merge-scores": "bun src/merge-scores.ts",
"lint": "biome check .",
Expand Down
241 changes: 154 additions & 87 deletions src/agent-index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
import { execSync } from 'node:child_process'
import { mkdir, writeFile } from 'node:fs/promises'
import path from 'node:path'
import { parseArgs } from 'node:util'
import Tinypool from 'tinypool'
import { parseArgs } from 'util'
import { classifyFailure } from '@/src/classifiers/failure'
import { EVALUATIONS } from '@/src/config'
import { getResults, initDB, saveError, saveResult } from '@/src/db'
import type {
Expand All @@ -25,6 +26,7 @@ import type {
Score,
} from '@/src/interfaces'
import { AGENTS, getAgentInfo, getAllAgentTypes } from '@/src/interfaces/agent'
import { summarizeTrials, type TrialResult } from '@/src/metrics/pass-at-k'
import consoleReporter from '@/src/reporters/console'
import fileReporter from '@/src/reporters/file'

Expand Down Expand Up @@ -57,6 +59,7 @@ const { values } = parseArgs({
debug: { type: 'boolean', short: 'd', default: false },
eval: { type: 'string', short: 'e' },
timeout: { type: 'string', short: 't' },
runs: { type: 'string', short: 'r' },
},
strict: true,
allowPositionals: true,
Expand All @@ -69,6 +72,7 @@ const skillsPath = values['skills-path'] || path.join(process.cwd(), '..', 'skil
const debugEnabled = values.debug
const evalFilter = values.eval
const timeoutArg = values.timeout
// Number of trials to run per evaluation (`--runs` / `-r`); defaults to a single run
// when the flag is omitted.
// Reject anything that does not parse to a positive integer up front: a NaN, zero, or
// negative count would make the per-evaluation trial loop execute zero times, so the
// process would announce the evaluations and then exit without running any of them.
const runsCount = values.runs ? Number.parseInt(values.runs, 10) : 1
if (!Number.isInteger(runsCount) || runsCount < 1) {
  throw new Error(`Invalid --runs value "${values.runs}": expected a positive integer`)
}

const normalizeEvalPath = (value: string) => {
if (value.startsWith('./')) return normalizeEvalPath(value.slice(2))
Expand Down Expand Up @@ -195,106 +199,169 @@ if (skillsEnabled) {
if (mcpEnabled) {
console.log(`MCP Server: ${mcpUrl}`)
}
console.log(`Running ${tasks.length} evaluations\n`)
if (runsCount > 1) {
console.log(`Runs per eval: ${runsCount}`)
}
console.log(
`Running ${tasks.length} evaluations${runsCount > 1 ? ` (${tasks.length * runsCount} total runs)` : ''}\n`,
)

let completed = 0
const totalRuns = tasks.length * runsCount

// Run all in parallel (with limited concurrency)
await Promise.all(
tasks.map(async (task) => {
console.log(`[start] ${task.agent} -> ${task.evaluationPath}`)

const runnerArgs: AgentRunnerArgs = {
agent: task.agent,
evalPath: task.evalPath,
debug: debugEnabled,
mcpConfig: mcpEnabled
? {
enabled: true,
serverUrl: mcpUrl,
}
: undefined,
skillsConfig: skillsEnabled
? {
enabled: true,
sourcePath: skillsPath,
evalPath: task.evaluationPath,
}
: undefined,
timeout: timeoutArg ? Number.parseInt(timeoutArg, 10) : undefined,
executablePath,
envPath: process.env.PATH,
fixturesPath: task.fixturesPath,
gradersPath: task.gradersPath,
}

try {
const result: RunnerResult = await pool.run(runnerArgs)

if (!result.ok) {
const errorMsg =
result.error instanceof Error
? result.error.message
: typeof result.error === 'object'
? JSON.stringify(result.error)
: String(result.error)
console.error(`[error] ${task.agent}: ${errorMsg}`)
const errorLabelParts: string[] = [agentInfo.label]
if (skillsEnabled) errorLabelParts.push('Skills')
if (mcpEnabled) errorLabelParts.push('MCP')
saveError(runId, {
model: task.agent,
label: errorLabelParts.join(' + '),
framework: task.framework,
category: task.category,
evaluationPath: task.evaluationPath,
error: result.error,
})
return
const trialResults: TrialResult[] = []

for (let trial = 1; trial <= runsCount; trial++) {
const trialLabel = runsCount > 1 ? ` [trial ${trial}/${runsCount}]` : ''
console.log(`[start] ${task.agent} -> ${task.evaluationPath}${trialLabel}`)

const runnerArgs: AgentRunnerArgs = {
agent: task.agent,
evalPath: task.evalPath,
debug: debugEnabled,
mcpConfig: mcpEnabled
? {
enabled: true,
serverUrl: mcpUrl,
}
: undefined,
skillsConfig: skillsEnabled
? {
enabled: true,
sourcePath: skillsPath,
evalPath: task.evaluationPath,
}
: undefined,
timeout: timeoutArg ? Number.parseInt(timeoutArg, 10) : undefined,
executablePath,
envPath: process.env.PATH,
fixturesPath: task.fixturesPath,
gradersPath: task.gradersPath,
}

const labelParts: string[] = [agentInfo.label]
if (skillsEnabled) labelParts.push('Skills')
if (mcpEnabled) labelParts.push('MCP')
const score: Score = {
model: task.agent,
label: labelParts.join(' + '),
framework: task.framework,
category: task.category,
value: result.value.score,
updatedAt: new Date().toISOString(),
}
saveResult(runId, score)
const startTime = Date.now()

try {
const result: RunnerResult = await pool.run(runnerArgs)

if (!result.ok) {
const errorMsg =
result.error instanceof Error
? result.error.message
: typeof result.error === 'object'
? JSON.stringify(result.error)
: String(result.error)
console.error(`[error] ${task.agent}${trialLabel}: ${errorMsg}`)

// Classify the failure
const failureType = classifyFailure(
{
success: false,
output: '',
duration: Date.now() - startTime,
exitCode: -1,
error: errorMsg,
},
timeoutArg ? Number.parseInt(timeoutArg, 10) : 600_000,
)

const errorLabelParts: string[] = [agentInfo.label]
if (skillsEnabled) errorLabelParts.push('Skills')
if (mcpEnabled) errorLabelParts.push('MCP')
saveError(runId, {
model: task.agent,
label: errorLabelParts.join(' + '),
framework: task.framework,
category: task.category,
evaluationPath: task.evaluationPath,
error: result.error,
trial,
failureType,
})

trialResults.push({
trial,
score: 0,
durationMs: Date.now() - startTime,
success: false,
})
continue
}

if (debugEnabled && result.value.debug && debugRunDirectory) {
const artifact: DebugArtifact = {
agent: task.agent,
const labelParts: string[] = [agentInfo.label]
if (skillsEnabled) labelParts.push('Skills')
if (mcpEnabled) labelParts.push('MCP')
const score: Score = {
model: task.agent,
label: labelParts.join(' + '),
framework: task.framework,
category: task.category,
evaluationPath: task.evaluationPath,
score: result.value.score,
prompt: result.value.debug.prompt,
response: result.value.debug.response,
graders: result.value.debug.graders,
transcript: result.value.debug.transcript,
value: result.value.score,
updatedAt: new Date().toISOString(),
durationMs: result.value.durationMs,
}
debugArtifacts.push(artifact)

// Write debug files
const evalSlug = task.variant
? `${task.evaluationPath.replace(/\//g, '__')}__${task.variant}`
: task.evaluationPath.replace(/\//g, '__')
const debugPath = path.join(debugRunDirectory, `${evalSlug}__${task.agent}.json`)
await writeFile(debugPath, JSON.stringify(result.value.debug, null, 2))

if (result.value.debug.transcript) {
const transcriptPath = path.join(debugRunDirectory, `${evalSlug}__${task.agent}.md`)
await writeFile(transcriptPath, result.value.debug.transcript)
saveResult(runId, score, task.evaluationPath)

trialResults.push({
trial,
score: result.value.score,
durationMs: result.value.durationMs ?? Date.now() - startTime,
success: result.value.score >= 0.5,
})

if (debugEnabled && result.value.debug && debugRunDirectory) {
const artifact: DebugArtifact = {
agent: task.agent,
framework: task.framework,
category: task.category,
evaluationPath: task.evaluationPath,
score: result.value.score,
prompt: result.value.debug.prompt,
response: result.value.debug.response,
graders: result.value.debug.graders,
transcript: result.value.debug.transcript,
}
debugArtifacts.push(artifact)

// Write debug files
const evalSlug = task.variant
? `${task.evaluationPath.replace(/\//g, '__')}__${task.variant}`
: task.evaluationPath.replace(/\//g, '__')
const trialSuffix = runsCount > 1 ? `__trial${trial}` : ''
const debugPath = path.join(
debugRunDirectory,
`${evalSlug}__${task.agent}${trialSuffix}.json`,
)
await writeFile(debugPath, JSON.stringify(result.value.debug, null, 2))

if (result.value.debug.transcript) {
const transcriptPath = path.join(
debugRunDirectory,
`${evalSlug}__${task.agent}${trialSuffix}.md`,
)
await writeFile(transcriptPath, result.value.debug.transcript)
}
}
} finally {
completed++
console.log(
`[done ${completed}/${totalRuns}] ${task.agent} -> ${task.evaluationPath}${trialLabel}`,
)
}
} finally {
completed++
console.log(`[done ${completed}/${tasks.length}] ${task.agent} -> ${task.evaluationPath}`)
}

// Log multi-trial summary
if (runsCount > 1 && trialResults.length > 0) {
const summary = summarizeTrials(trialResults)
console.log(
`[summary] ${task.evaluationPath}: ${summary.passed}/${summary.totalTrials} passed, ` +
`pass@1=${(summary.passAt1 * 100).toFixed(0)}%, ` +
`pass@${runsCount}=${(summary.passAtK * 100).toFixed(0)}%, ` +
`mean=${(summary.meanScore * 100).toFixed(0)}%`,
)
}
}),
)
Expand Down
Loading
Loading