clerk · Railly · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -42,6 +42,30 @@ bun start --skills --model "claude-sonnet-4-5"
 BRAINTRUST_API_KEY=sk-... bun report:braintrust
 ```
 
+### Agent Evaluations (Multi-Agent)
+
+```bash
+# Run agent evals with Claude Code
+bun agent:claude
+bun agent:claude --eval add-auth --debug
+
+# Run agent evals with Codex
+bun agent:codex
+bun agent:codex --eval add-auth --debug
+
+# Multi-trial (3 runs per eval; prints pass@1, pass@k, and mean score per eval)
+bun agent:claude --runs 3
+bun agent:codex --runs 3
+
+# With skills or MCP
+bun agent:claude --skills
+bun agent:claude --skills --mcp
+
+# Cross-agent leaderboard export
+bun export:leaderboard
+bun export:leaderboard --since 2026-03-20
+```
+
 ## Project Structure & Module Organization
 `src/index.ts` wires providers, runners, reporters, and every folder under `src/evals`. Keep each evaluation in its own directory with `PROMPT.md`, `graders.ts`, and any fixtures it needs. Use descriptive, numeric-free slugs like `src/evals/new-eval`. Runner logic lives in `src/runners`, shared provider clients in `src/providers`, scoring helpers in `src/scorers`, and reusable utilities in `src/utils`. Diagrams intended for contributor onboarding belong in `docs/`, while transient artifacts like `scores.json` stay gitignored at the root.
 

diff --git a/package.json b/package.json
@@ -14,9 +14,11 @@
     "start:agent": "bun src/agent-index.ts",
     "agent:claude": "bun src/agent-index.ts --agent claude-code",
     "agent:claude:mcp": "bun src/agent-index.ts --agent claude-code --mcp",
+    "agent:codex": "bun src/agent-index.ts --agent codex",
     "agent:skills": "bun src/agent-index.ts --agent claude-code --skills",
     "agent:skills:mcp": "bun src/agent-index.ts --agent claude-code --skills --mcp",
     "report:braintrust": "bun src/report-braintrust.ts",
+    "export:leaderboard": "bun src/export/leaderboard.ts",
     "export": "bun export-from-db.ts",
     "merge-scores": "bun src/merge-scores.ts",
     "lint": "biome check .",

diff --git a/src/agent-index.ts b/src/agent-index.ts
@@ -13,8 +13,9 @@
 import { execSync } from 'node:child_process'
 import { mkdir, writeFile } from 'node:fs/promises'
 import path from 'node:path'
+import { parseArgs } from 'node:util'
 import Tinypool from 'tinypool'
-import { parseArgs } from 'util'
+import { classifyFailure } from '@/src/classifiers/failure'
 import { EVALUATIONS } from '@/src/config'
 import { getResults, initDB, saveError, saveResult } from '@/src/db'
 import type {
@@ -25,6 +26,7 @@ import type {
   Score,
 } from '@/src/interfaces'
 import { AGENTS, getAgentInfo, getAllAgentTypes } from '@/src/interfaces/agent'
+import { summarizeTrials, type TrialResult } from '@/src/metrics/pass-at-k'
 import consoleReporter from '@/src/reporters/console'
 import fileReporter from '@/src/reporters/file'
 
@@ -57,6 +59,7 @@ const { values } = parseArgs({
     debug: { type: 'boolean', short: 'd', default: false },
     eval: { type: 'string', short: 'e' },
     timeout: { type: 'string', short: 't' },
+    runs: { type: 'string', short: 'r' },
   },
   strict: true,
   allowPositionals: true,
@@ -69,6 +72,7 @@ const skillsPath = values['skills-path'] || path.join(process.cwd(), '..', 'skil
 const debugEnabled = values.debug
 const evalFilter = values.eval
 const timeoutArg = values.timeout
+const runsCount = Math.max(1, Number.parseInt(values.runs ?? '', 10) || 1) // invalid or <1 => 1
 
 const normalizeEvalPath = (value: string) => {
   if (value.startsWith('./')) return normalizeEvalPath(value.slice(2))
@@ -195,106 +199,169 @@ if (skillsEnabled) {
 if (mcpEnabled) {
   console.log(`MCP Server: ${mcpUrl}`)
 }
-console.log(`Running ${tasks.length} evaluations\n`)
+if (runsCount > 1) {
+  console.log(`Runs per eval: ${runsCount}`)
+}
+console.log(
+  `Running ${tasks.length} evaluations${runsCount > 1 ? ` (${tasks.length * runsCount} total runs)` : ''}\n`,
+)
 
 let completed = 0
+const totalRuns = tasks.length * runsCount
 
 // Run all in parallel (with limited concurrency)
 await Promise.all(
   tasks.map(async (task) => {
-    console.log(`[start] ${task.agent} -> ${task.evaluationPath}`)
-
-    const runnerArgs: AgentRunnerArgs = {
-      agent: task.agent,
-      evalPath: task.evalPath,
-      debug: debugEnabled,
-      mcpConfig: mcpEnabled
-        ? {
-            enabled: true,
-            serverUrl: mcpUrl,
-          }
-        : undefined,
-      skillsConfig: skillsEnabled
-        ? {
-            enabled: true,
-            sourcePath: skillsPath,
-            evalPath: task.evaluationPath,
-          }
-        : undefined,
-      timeout: timeoutArg ? Number.parseInt(timeoutArg, 10) : undefined,
-      executablePath,
-      envPath: process.env.PATH,
-      fixturesPath: task.fixturesPath,
-      gradersPath: task.gradersPath,
-    }
-
-    try {
-      const result: RunnerResult = await pool.run(runnerArgs)
-
-      if (!result.ok) {
-        const errorMsg =
-          result.error instanceof Error
-            ? result.error.message
-            : typeof result.error === 'object'
-              ? JSON.stringify(result.error)
-              : String(result.error)
-        console.error(`[error] ${task.agent}: ${errorMsg}`)
-        const errorLabelParts: string[] = [agentInfo.label]
-        if (skillsEnabled) errorLabelParts.push('Skills')
-        if (mcpEnabled) errorLabelParts.push('MCP')
-        saveError(runId, {
-          model: task.agent,
-          label: errorLabelParts.join(' + '),
-          framework: task.framework,
-          category: task.category,
-          evaluationPath: task.evaluationPath,
-          error: result.error,
-        })
-        return
+    const trialResults: TrialResult[] = []
+
+    for (let trial = 1; trial <= runsCount; trial++) {
+      const trialLabel = runsCount > 1 ? ` [trial ${trial}/${runsCount}]` : ''
+      console.log(`[start] ${task.agent} -> ${task.evaluationPath}${trialLabel}`)
+
+      const runnerArgs: AgentRunnerArgs = {
+        agent: task.agent,
+        evalPath: task.evalPath,
+        debug: debugEnabled,
+        mcpConfig: mcpEnabled
+          ? {
+              enabled: true,
+              serverUrl: mcpUrl,
+            }
+          : undefined,
+        skillsConfig: skillsEnabled
+          ? {
+              enabled: true,
+              sourcePath: skillsPath,
+              evalPath: task.evaluationPath,
+            }
+          : undefined,
+        timeout: timeoutArg ? Number.parseInt(timeoutArg, 10) : undefined,
+        executablePath,
+        envPath: process.env.PATH,
+        fixturesPath: task.fixturesPath,
+        gradersPath: task.gradersPath,
       }
 
-      const labelParts: string[] = [agentInfo.label]
-      if (skillsEnabled) labelParts.push('Skills')
-      if (mcpEnabled) labelParts.push('MCP')
-      const score: Score = {
-        model: task.agent,
-        label: labelParts.join(' + '),
-        framework: task.framework,
-        category: task.category,
-        value: result.value.score,
-        updatedAt: new Date().toISOString(),
-      }
-      saveResult(runId, score)
+      const startTime = Date.now()
+
+      try {
+        const result: RunnerResult = await pool.run(runnerArgs)
+
+        if (!result.ok) {
+          const errorMsg =
+            result.error instanceof Error
+              ? result.error.message
+              : typeof result.error === 'object'
+                ? JSON.stringify(result.error)
+                : String(result.error)
+          console.error(`[error] ${task.agent}${trialLabel}: ${errorMsg}`)
+
+          // Classify the failure
+          const failureType = classifyFailure(
+            {
+              success: false,
+              output: '',
+              duration: Date.now() - startTime,
+              exitCode: -1,
+              error: errorMsg,
+            },
+            timeoutArg ? Number.parseInt(timeoutArg, 10) : 600_000,
+          )
+
+          const errorLabelParts: string[] = [agentInfo.label]
+          if (skillsEnabled) errorLabelParts.push('Skills')
+          if (mcpEnabled) errorLabelParts.push('MCP')
+          saveError(runId, {
+            model: task.agent,
+            label: errorLabelParts.join(' + '),
+            framework: task.framework,
+            category: task.category,
+            evaluationPath: task.evaluationPath,
+            error: result.error,
+            trial,
+            failureType,
+          })
+
+          trialResults.push({
+            trial,
+            score: 0,
+            durationMs: Date.now() - startTime,
+            success: false,
+          })
+          continue
+        }
 
-      if (debugEnabled && result.value.debug && debugRunDirectory) {
-        const artifact: DebugArtifact = {
-          agent: task.agent,
+        const labelParts: string[] = [agentInfo.label]
+        if (skillsEnabled) labelParts.push('Skills')
+        if (mcpEnabled) labelParts.push('MCP')
+        const score: Score = {
+          model: task.agent,
+          label: labelParts.join(' + '),
           framework: task.framework,
           category: task.category,
-          evaluationPath: task.evaluationPath,
-          score: result.value.score,
-          prompt: result.value.debug.prompt,
-          response: result.value.debug.response,
-          graders: result.value.debug.graders,
-          transcript: result.value.debug.transcript,
+          value: result.value.score,
+          updatedAt: new Date().toISOString(),
+          durationMs: result.value.durationMs,
         }
-        debugArtifacts.push(artifact)
-
-        // Write debug files
-        const evalSlug = task.variant
-          ? `${task.evaluationPath.replace(/\//g, '__')}__${task.variant}`
-          : task.evaluationPath.replace(/\//g, '__')
-        const debugPath = path.join(debugRunDirectory, `${evalSlug}__${task.agent}.json`)
-        await writeFile(debugPath, JSON.stringify(result.value.debug, null, 2))
-
-        if (result.value.debug.transcript) {
-          const transcriptPath = path.join(debugRunDirectory, `${evalSlug}__${task.agent}.md`)
-          await writeFile(transcriptPath, result.value.debug.transcript)
+        saveResult(runId, score, task.evaluationPath)
+
+        trialResults.push({
+          trial,
+          score: result.value.score,
+          durationMs: result.value.durationMs ?? Date.now() - startTime,
+          success: result.value.score >= 0.5,
+        })
+
+        if (debugEnabled && result.value.debug && debugRunDirectory) {
+          const artifact: DebugArtifact = {
+            agent: task.agent,
+            framework: task.framework,
+            category: task.category,
+            evaluationPath: task.evaluationPath,
+            score: result.value.score,
+            prompt: result.value.debug.prompt,
+            response: result.value.debug.response,
+            graders: result.value.debug.graders,
+            transcript: result.value.debug.transcript,
+          }
+          debugArtifacts.push(artifact)
+
+          // Write debug files
+          const evalSlug = task.variant
+            ? `${task.evaluationPath.replace(/\//g, '__')}__${task.variant}`
+            : task.evaluationPath.replace(/\//g, '__')
+          const trialSuffix = runsCount > 1 ? `__trial${trial}` : ''
+          const debugPath = path.join(
+            debugRunDirectory,
+            `${evalSlug}__${task.agent}${trialSuffix}.json`,
+          )
+          await writeFile(debugPath, JSON.stringify(result.value.debug, null, 2))
+
+          if (result.value.debug.transcript) {
+            const transcriptPath = path.join(
+              debugRunDirectory,
+              `${evalSlug}__${task.agent}${trialSuffix}.md`,
+            )
+            await writeFile(transcriptPath, result.value.debug.transcript)
+          }
         }
+      } finally {
+        completed++
+        console.log(
+          `[done ${completed}/${totalRuns}] ${task.agent} -> ${task.evaluationPath}${trialLabel}`,
+        )
       }
-    } finally {
-      completed++
-      console.log(`[done ${completed}/${tasks.length}] ${task.agent} -> ${task.evaluationPath}`)
+    }
+
+    // Log multi-trial summary
+    if (runsCount > 1 && trialResults.length > 0) {
+      const summary = summarizeTrials(trialResults)
+      console.log(
+        `[summary] ${task.evaluationPath}: ${summary.passed}/${summary.totalTrials} passed, ` +
+          `pass@1=${(summary.passAt1 * 100).toFixed(0)}%, ` +
+          `pass@${runsCount}=${(summary.passAtK * 100).toFixed(0)}%, ` +
+          `mean=${(summary.meanScore * 100).toFixed(0)}%`,
+      )
     }
   }),
 )