Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions evals/apps/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
"execa": "^9.5.2",
"gluegun": "^5.1.2",
"p-map": "^7.0.3",
"p-wait-for": "^5.0.2"
"p-wait-for": "^5.0.2",
"ps-tree": "^1.2.0"
},
"devDependencies": {
"@evals/eslint-config": "workspace:^",
"@evals/typescript-config": "workspace:^"
"@evals/typescript-config": "workspace:^",
"@types/ps-tree": "^1.1.6"
}
}
111 changes: 76 additions & 35 deletions evals/apps/cli/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import pMap from "p-map"
import pWaitFor from "p-wait-for"
import { execa, parseCommandString } from "execa"
import { build, filesystem, GluegunPrompt, GluegunToolbox } from "gluegun"
import psTree from "ps-tree"

import {
type ExerciseLanguage,
Expand Down Expand Up @@ -36,8 +37,9 @@ import { getExercises } from "./exercises.js"
type TaskResult = { success: boolean; retry: boolean }
type TaskPromise = Promise<TaskResult>

const TASK_TIMEOUT = 10 * 60 * 1_000
const UNIT_TEST_TIMEOUT = 60 * 1_000
const TASK_START_DELAY = 10 * 1_000
const TASK_TIMEOUT = 5 * 60 * 1_000
const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000

const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
Expand Down Expand Up @@ -98,13 +100,11 @@ const run = async (toolbox: GluegunToolbox) => {
throw new Error("No tasks found.")
}

console.log(await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`)
console.log(await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`)
console.log(await execa({ cwd: exercisesPath })`git checkout -f`)
console.log(await execa({ cwd: exercisesPath })`git clean -fd`)
console.log(
await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`,
)
await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`
await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`
await execa({ cwd: exercisesPath })`git checkout -f`
await execa({ cwd: exercisesPath })`git clean -fd`
await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`

fs.writeFileSync(
path.resolve(exercisesPath, "settings.json"),
Expand Down Expand Up @@ -145,11 +145,11 @@ const run = async (toolbox: GluegunToolbox) => {
}
}

let delay = 0
let delay = TASK_START_DELAY

for (const task of tasks) {
const promise = processTask(task, delay)
delay = delay + 5_000
delay = delay + TASK_START_DELAY
runningPromises.push(promise)
promise.then(() => processTaskResult(task, promise))

Expand All @@ -162,10 +162,10 @@ const run = async (toolbox: GluegunToolbox) => {
await Promise.all(runningPromises)

const result = await finishRun(run.id)
console.log("[cli#run]", result)
console.log(`${Date.now()} [cli#run]`, result)

console.log(await execa({ cwd: exercisesPath })`git add .`)
console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
await execa({ cwd: exercisesPath })`git add .`
await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`
}

const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
Expand All @@ -180,9 +180,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
// Don't await execa and store result as subprocess.
// subprocess.stdout.pipe(process.stdout)

// Sleep for a random amount of time before opening a new VSCode window.
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 5_000))
console.log(`Opening new VS Code window at ${workspacePath}`)
console.log(`${Date.now()} [cli#runExercise] Opening new VS Code window at ${workspacePath}`)

await execa({
env: {
Expand All @@ -192,15 +190,15 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
})`code --disable-workspace-trust -n ${workspacePath}`

// Give VSCode some time to spawn before connecting to its unix socket.
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
console.log(`Connecting to ${taskSocketPath}`)
await new Promise((resolve) => setTimeout(resolve, 3_000))
console.log(`${Date.now()} [cli#runExercise] Connecting to ${taskSocketPath}`)
const client = new IpcClient(taskSocketPath)

try {
await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
// eslint-disable-next-line @typescript-eslint/no-unused-vars
} catch (error) {
console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] unable to connect`)
client.disconnect()
return { success: false, retry: false }
}
Expand All @@ -220,16 +218,20 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
const { eventName, payload } = taskEvent

server.broadcast({
type: IpcMessageType.TaskEvent,
origin: IpcOrigin.Server,
relayClientId: client.clientId!,
data: { ...taskEvent, taskId: task.id },
})
if (taskEvent.eventName !== RooCodeEventName.Message) {
server.broadcast({
type: IpcMessageType.TaskEvent,
origin: IpcOrigin.Server,
relayClientId: client.clientId!,
data: { ...taskEvent, taskId: task.id },
})
}

if (!ignoreEvents.includes(eventName)) {
console.log(`[cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`)
console.log(payload)
console.log(
`${Date.now()} [cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`,
payload,
)
}

if (eventName === RooCodeEventName.TaskStarted) {
Expand Down Expand Up @@ -279,11 +281,11 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
})

client.on(IpcMessageType.Disconnect, async () => {
console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] disconnect`)
isClientDisconnected = true
})

console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] starting task`)

client.sendMessage({
type: IpcMessageType.TaskCommand,
Expand All @@ -307,7 +309,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
// eslint-disable-next-line @typescript-eslint/no-unused-vars
} catch (error) {
console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] time limit reached`)

// Cancel the task.
if (rooTaskId && !isClientDisconnected) {
Expand Down Expand Up @@ -351,17 +353,56 @@ const runUnitTest = async ({ task }: { task: Task }) => {
let passed = true

for (const command of commands) {
const timeout = cmd.timeout ?? UNIT_TEST_TIMEOUT

try {
const result = await execa({ cwd, shell: true, reject: false, timeout })`${command}`
console.log(
`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] running "${command.join(" ")}"`,
)
const subprocess = execa({ cwd, shell: true, reject: false })`${command}`

// Watchdog for the unit test command: if it is still running once
// UNIT_TEST_TIMEOUT elapses, force-kill (SIGKILL) the processes it spawned
// and then the command itself. The caller clears this timer when the
// subprocess settles first.
const timeout = setTimeout(async () => {
	const pid = subprocess.pid

	// Without a pid there is no process to look up or kill.
	if (pid === undefined) {
		return
	}

	const logPrefix = `[cli#runUnitTest | ${task.language} / ${task.exercise}]`

	try {
		const descendants = await new Promise<number[]>((resolve, reject) => {
			psTree(pid, (err, children) => {
				if (err) {
					// Stop here so `children` is never read after a failed lookup.
					reject(err)
					return
				}

				resolve(children.map((p) => parseInt(p.PID, 10)))
			})
		})

		if (descendants.length > 0) {
			console.log(`${Date.now()} ${logPrefix} killing ${descendants.join(" ")}`)

			// Interpolate the array itself: execa expands an array into one
			// argument per element, whereas a space-joined string would be
			// handed to `kill` as a single (invalid) pid argument.
			await execa`kill -9 ${descendants}`
		}
	} catch (error) {
		// Best effort: a failed lookup or kill of the descendants must not
		// prevent the main process from being killed below.
		console.error("Error killing descendant processes:", error)
	}

	console.log(`${Date.now()} ${logPrefix} killing ${pid}`)

	try {
		await execa`kill -9 ${pid}`
	} catch (error) {
		// The process may already have exited; log instead of leaving an
		// unhandled rejection inside the timer callback.
		console.error("Error killing process:", error)
	}
}, UNIT_TEST_TIMEOUT)

const result = await subprocess

console.log(
`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}" result -> ${JSON.stringify(result)}`,
)

clearTimeout(timeout)

if (result.failed) {
passed = false
break
}
} catch (error) {
console.log("[cli#runUnitTest]", error)
console.log(`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}]`, error)
passed = false
break
}
Expand Down
97 changes: 41 additions & 56 deletions evals/apps/web/src/app/runs/[id]/run.tsx
Original file line number Diff line number Diff line change
@@ -1,33 +1,44 @@
"use client"

import { useState, useRef } from "react"
import { LoaderCircle, SquareTerminal } from "lucide-react"
import { useMemo } from "react"
import { LoaderCircle } from "lucide-react"

import * as db from "@evals/db"

import { formatCurrency, formatDuration, formatTokens } from "@/lib"
import { useRunStatus } from "@/hooks/use-run-status"
import {
Drawer,
DrawerContent,
DrawerHeader,
DrawerTitle,
ScrollArea,
Table,
TableBody,
TableCell,
TableHead,
TableHeader,
TableRow,
} from "@/components/ui"
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"

import { TaskStatus } from "./task-status"
import { ConnectionStatus } from "./connection-status"

type TaskMetrics = Pick<db.TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost">

export function Run({ run }: { run: db.Run }) {
const { tasks, status, output, outputCounts } = useRunStatus(run)
const scrollAreaRef = useRef<HTMLDivElement>(null)
const [selectedTask, setSelectedTask] = useState<db.Task>()
const { tasks, status, tokenUsage, usageUpdatedAt } = useRunStatus(run)

// Metrics to display per task, keyed by task id. A finished task that has
// stored metrics uses those; otherwise the task's entry in `tokenUsage`
// (if any) is mapped into the same shape. Tasks with neither get no entry.
const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
	const byTaskId: Record<number, TaskMetrics> = {}

	for (const task of tasks ?? []) {
		const liveUsage = tokenUsage.get(task.id)

		if (task.finishedAt && task.taskMetrics) {
			byTaskId[task.id] = task.taskMetrics
			continue
		}

		if (!liveUsage) {
			continue
		}

		byTaskId[task.id] = {
			tokensIn: liveUsage.totalTokensIn,
			tokensOut: liveUsage.totalTokensOut,
			tokensContext: liveUsage.contextTokens,
			// Duration may be missing on the usage entry; fall back to 0.
			duration: liveUsage.duration ?? 0,
			cost: liveUsage.totalCost,
		}
	}

	return byTaskId
	// NOTE(review): `usageUpdatedAt` is not read in the callback; presumably it
	// is listed to force a recompute when `tokenUsage` changes - confirm.
	// eslint-disable-next-line react-hooks/exhaustive-deps
}, [tasks, tokenUsage, usageUpdatedAt])

return (
<>
Expand Down Expand Up @@ -57,38 +68,33 @@ export function Run({ run }: { run: db.Run }) {
<TableRow key={task.id}>
<TableCell>
<div className="flex items-center gap-2">
<TaskStatus task={task} />
<TaskStatus
task={task}
running={!!task.startedAt || !!tokenUsage.get(task.id)}
/>
<div>
{task.language}/{task.exercise}
</div>
{(outputCounts[task.id] ?? 0) > 0 && (
<div
className="flex items-center gap-1 cursor-pointer"
onClick={() => setSelectedTask(task)}>
<SquareTerminal className="size-4" />
<div className="font-mono text-xs text-foreground/50">
{outputCounts[task.id]}
</div>
</div>
)}
</div>
</TableCell>
{task.taskMetrics ? (
{taskMetrics[task.id] ? (
<>
<TableCell className="font-mono text-xs">
<div className="flex items-center justify-evenly">
<div>{formatTokens(task.taskMetrics.tokensIn)}</div>/
<div>{formatTokens(task.taskMetrics.tokensOut)}</div>
<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
</div>
</TableCell>
<TableCell className="font-mono text-xs">
{formatTokens(task.taskMetrics.tokensContext)}
{formatTokens(taskMetrics[task.id]!.tokensContext)}
</TableCell>
<TableCell className="font-mono text-xs">
{formatDuration(task.taskMetrics.duration)}
{taskMetrics[task.id]!.duration
? formatDuration(taskMetrics[task.id]!.duration)
: "-"}
</TableCell>
<TableCell className="font-mono text-xs">
{formatCurrency(task.taskMetrics.cost)}
{formatCurrency(taskMetrics[task.id]!.cost)}
</TableCell>
</>
) : (
Expand All @@ -100,27 +106,6 @@ export function Run({ run }: { run: db.Run }) {
</Table>
)}
</div>
<Drawer open={!!selectedTask} onOpenChange={() => setSelectedTask(undefined)}>
<DrawerContent>
<div className="mx-auto w-full max-w-2xl">
<DrawerHeader>
<DrawerTitle>
{selectedTask?.language}/{selectedTask?.exercise}
</DrawerTitle>
</DrawerHeader>
<div className="font-mono text-xs pb-12">
{selectedTask && (
<ScrollArea viewportRef={scrollAreaRef} className="h-96 rounded-sm border">
<div className="p-4">
<h4 className="mb-4 text-sm font-medium leading-none">Tags</h4>
{output.get(selectedTask.id)?.map((line, i) => <div key={i}>{line}</div>)}
</div>
</ScrollArea>
)}
</div>
</div>
</DrawerContent>
</Drawer>
</>
)
}
7 changes: 3 additions & 4 deletions evals/apps/web/src/app/runs/[id]/task-status.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,15 @@ import { type Task } from "@evals/db"

type TaskStatusProps = {
task: Task
running: boolean
}

export const TaskStatus = ({ task }: TaskStatusProps) => {
export const TaskStatus = ({ task, running }: TaskStatusProps) => {
return task.passed === false ? (
<CircleSlash className="size-4 text-destructive" />
) : task.passed === true ? (
<CircleCheck className="size-4 text-green-500" />
) : task.startedAt ? (
<LoaderCircle className="size-4 animate-spin" />
) : task.finishedAt ? (
) : running ? (
<LoaderCircle className="size-4 animate-spin" />
) : (
<CircleDashed className="size-4" />
Expand Down
Loading