Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions evals/apps/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
"execa": "^9.5.2",
"gluegun": "^5.1.2",
"p-map": "^7.0.3",
"p-wait-for": "^5.0.2"
"p-wait-for": "^5.0.2",
"ps-tree": "^1.2.0"
},
"devDependencies": {
"@evals/eslint-config": "workspace:^",
"@evals/typescript-config": "workspace:^"
"@evals/typescript-config": "workspace:^",
"@types/ps-tree": "^1.1.6"
}
}
111 changes: 76 additions & 35 deletions evals/apps/cli/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import pMap from "p-map"
import pWaitFor from "p-wait-for"
import { execa, parseCommandString } from "execa"
import { build, filesystem, GluegunPrompt, GluegunToolbox } from "gluegun"
import psTree from "ps-tree"

import {
type ExerciseLanguage,
Expand Down Expand Up @@ -36,8 +37,9 @@ import { getExercises } from "./exercises.js"
type TaskResult = { success: boolean; retry: boolean }
type TaskPromise = Promise<TaskResult>

const TASK_TIMEOUT = 10 * 60 * 1_000
const UNIT_TEST_TIMEOUT = 60 * 1_000
const TASK_START_DELAY = 10 * 1_000
const TASK_TIMEOUT = 5 * 60 * 1_000
const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000

const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
Expand Down Expand Up @@ -98,13 +100,11 @@ const run = async (toolbox: GluegunToolbox) => {
throw new Error("No tasks found.")
}

console.log(await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`)
console.log(await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`)
console.log(await execa({ cwd: exercisesPath })`git checkout -f`)
console.log(await execa({ cwd: exercisesPath })`git clean -fd`)
console.log(
await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`,
)
await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`
await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`
await execa({ cwd: exercisesPath })`git checkout -f`
await execa({ cwd: exercisesPath })`git clean -fd`
await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`

fs.writeFileSync(
path.resolve(exercisesPath, "settings.json"),
Expand Down Expand Up @@ -145,11 +145,11 @@ const run = async (toolbox: GluegunToolbox) => {
}
}

let delay = 0
let delay = TASK_START_DELAY

for (const task of tasks) {
const promise = processTask(task, delay)
delay = delay + 5_000
delay = delay + TASK_START_DELAY
runningPromises.push(promise)
promise.then(() => processTaskResult(task, promise))

Expand All @@ -162,10 +162,10 @@ const run = async (toolbox: GluegunToolbox) => {
await Promise.all(runningPromises)

const result = await finishRun(run.id)
console.log("[cli#run]", result)
console.log(`${Date.now()} [cli#run]`, result)

console.log(await execa({ cwd: exercisesPath })`git add .`)
console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
await execa({ cwd: exercisesPath })`git add .`
await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`
}

const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
Expand All @@ -180,9 +180,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
// Don't await execa and store result as subprocess.
// subprocess.stdout.pipe(process.stdout)

// Sleep for a random amount of time before opening a new VSCode window.
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 5_000))
console.log(`Opening new VS Code window at ${workspacePath}`)
console.log(`${Date.now()} [cli#runExercise] Opening new VS Code window at ${workspacePath}`)

await execa({
env: {
Expand All @@ -192,15 +190,15 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
})`code --disable-workspace-trust -n ${workspacePath}`

// Give VSCode some time to spawn before connecting to its unix socket.
await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
console.log(`Connecting to ${taskSocketPath}`)
await new Promise((resolve) => setTimeout(resolve, 3_000))
console.log(`${Date.now()} [cli#runExercise] Connecting to ${taskSocketPath}`)
const client = new IpcClient(taskSocketPath)

try {
await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
// eslint-disable-next-line @typescript-eslint/no-unused-vars
} catch (error) {
console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] unable to connect`)
client.disconnect()
return { success: false, retry: false }
}
Expand All @@ -220,16 +218,20 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
const { eventName, payload } = taskEvent

server.broadcast({
type: IpcMessageType.TaskEvent,
origin: IpcOrigin.Server,
relayClientId: client.clientId!,
data: { ...taskEvent, taskId: task.id },
})
if (taskEvent.eventName !== RooCodeEventName.Message) {
server.broadcast({
type: IpcMessageType.TaskEvent,
origin: IpcOrigin.Server,
relayClientId: client.clientId!,
data: { ...taskEvent, taskId: task.id },
})
}

if (!ignoreEvents.includes(eventName)) {
console.log(`[cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`)
console.log(payload)
console.log(
`${Date.now()} [cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`,
payload,
)
}

if (eventName === RooCodeEventName.TaskStarted) {
Expand Down Expand Up @@ -279,11 +281,11 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
})

client.on(IpcMessageType.Disconnect, async () => {
console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] disconnect`)
isClientDisconnected = true
})

console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] starting task`)

client.sendMessage({
type: IpcMessageType.TaskCommand,
Expand All @@ -307,7 +309,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
// eslint-disable-next-line @typescript-eslint/no-unused-vars
} catch (error) {
console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)
console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] time limit reached`)

// Cancel the task.
if (rooTaskId && !isClientDisconnected) {
Expand Down Expand Up @@ -351,17 +353,56 @@ const runUnitTest = async ({ task }: { task: Task }) => {
let passed = true

for (const command of commands) {
const timeout = cmd.timeout ?? UNIT_TEST_TIMEOUT

try {
const result = await execa({ cwd, shell: true, reject: false, timeout })`${command}`
console.log(
`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] running "${command.join(" ")}"`,
)
const subprocess = execa({ cwd, shell: true, reject: false })`${command}`

// Watchdog for the unit-test command: if it is still running after
// UNIT_TEST_TIMEOUT ms, SIGKILL its descendant processes (discovered with
// ps-tree) and then the command itself. The surrounding code cancels this
// timer with clearTimeout() once the subprocess settles.
// NOTE(review): the optional per-language `timeout` declared on `testCommands`
// is not consulted here — confirm whether that is intended.
const timeout = setTimeout(async () => {
    const tag = `[cli#runUnitTest | ${task.language} / ${task.exercise}]`
    const pid = subprocess.pid

    // No pid means the process never spawned, so there is nothing to kill.
    if (pid === undefined) {
        return
    }

    try {
        const descendants = await new Promise<number[]>((resolve, reject) => {
            psTree(pid, (err, children) => {
                if (err) {
                    // Return here so we never fall through and read `children`
                    // after a failed lookup.
                    reject(err)
                    return
                }

                resolve(children.map((p) => Number.parseInt(p.PID, 10)).filter((id) => Number.isInteger(id)))
            })
        })

        if (descendants.length > 0) {
            console.log(`${Date.now()} ${tag} killing ${descendants.join(" ")}`)

            // Interpolating the array makes execa pass one argument per PID.
            // A space-joined string would reach `kill` as a single argument
            // and fail whenever there is more than one descendant.
            await execa`kill -9 ${descendants}`
        }
    } catch (error) {
        // Best effort: a failed lookup or kill of descendants must not stop us
        // from killing the main process below.
        console.error("Error killing descendant processes:", error)
    }

    try {
        console.log(`${Date.now()} ${tag} killing ${pid}`)

        await execa`kill -9 ${pid}`
    } catch (error) {
        // The process may already have exited; an uncaught rejection inside a
        // timer callback would surface as an unhandled promise rejection.
        console.error(`${Date.now()} ${tag} error killing ${pid}:`, error)
    }
}, UNIT_TEST_TIMEOUT)

const result = await subprocess

console.log(
`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}" result -> ${JSON.stringify(result)}`,
)

clearTimeout(timeout)

if (result.failed) {
passed = false
break
}
} catch (error) {
console.log("[cli#runUnitTest]", error)
console.log(`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}]`, error)
passed = false
break
}
Expand Down
97 changes: 41 additions & 56 deletions evals/apps/web/src/app/runs/[id]/run.tsx
Original file line number Diff line number Diff line change
@@ -1,33 +1,44 @@
"use client"

import { useState, useRef } from "react"
import { LoaderCircle, SquareTerminal } from "lucide-react"
import { useMemo } from "react"
import { LoaderCircle } from "lucide-react"

import * as db from "@evals/db"

import { formatCurrency, formatDuration, formatTokens } from "@/lib"
import { useRunStatus } from "@/hooks/use-run-status"
import {
Drawer,
DrawerContent,
DrawerHeader,
DrawerTitle,
ScrollArea,
Table,
TableBody,
TableCell,
TableHead,
TableHeader,
TableRow,
} from "@/components/ui"
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"

import { TaskStatus } from "./task-status"
import { ConnectionStatus } from "./connection-status"

// The subset of `db.TaskMetrics` fields that the run table renders per task.
type TaskMetrics = Pick<db.TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost">

export function Run({ run }: { run: db.Run }) {
const { tasks, status, output, outputCounts } = useRunStatus(run)
const scrollAreaRef = useRef<HTMLDivElement>(null)
const [selectedTask, setSelectedTask] = useState<db.Task>()
const { tasks, status, tokenUsage, usageUpdatedAt } = useRunStatus(run)

// Metrics to display for each task, keyed by task id. A finished task with
// stored metrics uses those; otherwise the live token usage (if any) is mapped
// into the same shape. Tasks with neither get no entry.
const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
    const byTaskId: Record<number, TaskMetrics> = {}

    for (const task of tasks ?? []) {
        const liveUsage = tokenUsage.get(task.id)

        // Stored metrics take precedence once the task has finished.
        if (task.finishedAt && task.taskMetrics) {
            byTaskId[task.id] = task.taskMetrics
            continue
        }

        if (!liveUsage) {
            continue
        }

        const { totalTokensIn, totalTokensOut, contextTokens, duration, totalCost } = liveUsage

        byTaskId[task.id] = {
            tokensIn: totalTokensIn,
            tokensOut: totalTokensOut,
            tokensContext: contextTokens,
            duration: duration ?? 0,
            cost: totalCost,
        }
    }

    return byTaskId
    // `usageUpdatedAt` is not read in the callback; presumably it is listed so
    // the memo recomputes when `tokenUsage` changes in place — TODO confirm.
    // eslint-disable-next-line react-hooks/exhaustive-deps
}, [tasks, tokenUsage, usageUpdatedAt])

return (
<>
Expand Down Expand Up @@ -57,38 +68,33 @@ export function Run({ run }: { run: db.Run }) {
<TableRow key={task.id}>
<TableCell>
<div className="flex items-center gap-2">
<TaskStatus task={task} />
<TaskStatus
task={task}
running={!!task.startedAt || !!tokenUsage.get(task.id)}
/>
<div>
{task.language}/{task.exercise}
</div>
{(outputCounts[task.id] ?? 0) > 0 && (
<div
className="flex items-center gap-1 cursor-pointer"
onClick={() => setSelectedTask(task)}>
<SquareTerminal className="size-4" />
<div className="font-mono text-xs text-foreground/50">
{outputCounts[task.id]}
</div>
</div>
)}
</div>
</TableCell>
{task.taskMetrics ? (
{taskMetrics[task.id] ? (
<>
<TableCell className="font-mono text-xs">
<div className="flex items-center justify-evenly">
<div>{formatTokens(task.taskMetrics.tokensIn)}</div>/
<div>{formatTokens(task.taskMetrics.tokensOut)}</div>
<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
</div>
</TableCell>
<TableCell className="font-mono text-xs">
{formatTokens(task.taskMetrics.tokensContext)}
{formatTokens(taskMetrics[task.id]!.tokensContext)}
</TableCell>
<TableCell className="font-mono text-xs">
{formatDuration(task.taskMetrics.duration)}
{taskMetrics[task.id]!.duration
? formatDuration(taskMetrics[task.id]!.duration)
: "-"}
</TableCell>
<TableCell className="font-mono text-xs">
{formatCurrency(task.taskMetrics.cost)}
{formatCurrency(taskMetrics[task.id]!.cost)}
</TableCell>
</>
) : (
Expand All @@ -100,27 +106,6 @@ export function Run({ run }: { run: db.Run }) {
</Table>
)}
</div>
<Drawer open={!!selectedTask} onOpenChange={() => setSelectedTask(undefined)}>
<DrawerContent>
<div className="mx-auto w-full max-w-2xl">
<DrawerHeader>
<DrawerTitle>
{selectedTask?.language}/{selectedTask?.exercise}
</DrawerTitle>
</DrawerHeader>
<div className="font-mono text-xs pb-12">
{selectedTask && (
<ScrollArea viewportRef={scrollAreaRef} className="h-96 rounded-sm border">
<div className="p-4">
<h4 className="mb-4 text-sm font-medium leading-none">Tags</h4>
{output.get(selectedTask.id)?.map((line, i) => <div key={i}>{line}</div>)}
</div>
</ScrollArea>
)}
</div>
</div>
</DrawerContent>
</Drawer>
</>
)
}
7 changes: 3 additions & 4 deletions evals/apps/web/src/app/runs/[id]/task-status.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,15 @@ import { type Task } from "@evals/db"

// Props for the `TaskStatus` icon component.
type TaskStatusProps = {
// Task whose `passed` flag selects the failed / passed icon.
task: Task
// When `passed` is neither true nor false, a truthy value shows the spinner.
running: boolean
}

export const TaskStatus = ({ task }: TaskStatusProps) => {
export const TaskStatus = ({ task, running }: TaskStatusProps) => {
return task.passed === false ? (
<CircleSlash className="size-4 text-destructive" />
) : task.passed === true ? (
<CircleCheck className="size-4 text-green-500" />
) : task.startedAt ? (
<LoaderCircle className="size-4 animate-spin" />
) : task.finishedAt ? (
) : running ? (
<LoaderCircle className="size-4 animate-spin" />
) : (
<CircleDashed className="size-4" />
Expand Down
Loading