Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 24 additions & 42 deletions apps/web-evals/src/actions/runs.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
"use server"

import * as path from "path"
import fs from "fs"
import { fileURLToPath } from "url"
import { spawn } from "child_process"

import { revalidatePath } from "next/cache"
import pMap from "p-map"
Expand All @@ -15,18 +13,25 @@ import {
deleteRun as _deleteRun,
createTask,
getExercisesForLanguage,
updateRun as _updateRun,
} from "@roo-code/evals"

import { CreateRun } from "@/lib/schemas"
import { enqueueRun, dequeueRun } from "@/lib/server/queue"
import { startQueueProcessor } from "@/lib/server/queue-processor"

const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")

// Start the queue processor as a side effect of this module being evaluated.
// A rejected start-up promise is only logged via console.error; it is not rethrown.
startQueueProcessor().catch(console.error)

// eslint-disable-next-line @typescript-eslint/no-unused-vars
export async function createRun({ suite, exercises = [], systemPrompt, timeout, ...values }: CreateRun) {
const run = await _createRun({
...values,
timeout,
socketPath: "", // TODO: Get rid of this.
status: "queued", // Set initial status to queued
})

if (suite === "partial") {
Expand All @@ -49,52 +54,29 @@ export async function createRun({ suite, exercises = [], systemPrompt, timeout,
}
}

revalidatePath("/runs")

try {
const isRunningInDocker = fs.existsSync("/.dockerenv")

const dockerArgs = [
`--name evals-controller-${run.id}`,
"--rm",
"--network evals_default",
"-v /var/run/docker.sock:/var/run/docker.sock",
"-v /tmp/evals:/var/log/evals",
"-e HOST_EXECUTION_METHOD=docker",
]

const cliCommand = `pnpm --filter @roo-code/evals cli --runId ${run.id}`
// Add run to queue and get position
const queuePosition = await enqueueRun(run.id)

const command = isRunningInDocker
? `docker run ${dockerArgs.join(" ")} evals-runner sh -c "${cliCommand}"`
: cliCommand

console.log("spawn ->", command)

const childProcess = spawn("sh", ["-c", command], {
detached: true,
stdio: ["ignore", "pipe", "pipe"],
})

const logStream = fs.createWriteStream("/tmp/roo-code-evals.log", { flags: "a" })

if (childProcess.stdout) {
childProcess.stdout.pipe(logStream)
}
// Update run with queue position
await _updateRun(run.id, { queuePosition })

if (childProcess.stderr) {
childProcess.stderr.pipe(logStream)
}

childProcess.unref()
} catch (error) {
console.error(error)
}
revalidatePath("/runs")

return run
return { ...run, queuePosition }
}

/**
 * Server action that deletes a run.
 *
 * The run id is first passed to `dequeueRun`, then the run itself is deleted,
 * and finally the "/runs" path is revalidated.
 *
 * @param runId - Identifier of the run to delete.
 */
export async function deleteRun(runId: number): Promise<void> {
	// Dequeue before deleting so the queue is updated ahead of the run removal.
	await dequeueRun(runId)
	await _deleteRun(runId)

	revalidatePath("/runs")
}

/**
 * Server action that cancels a run via the queue processor.
 *
 * Calls `cancelQueuedRun` for the given id and then revalidates "/runs".
 *
 * @param runId - Identifier of the run to cancel.
 */
export async function cancelRun(runId: number): Promise<void> {
	// The queue-processor module is loaded lazily at call time.
	const queueProcessor = await import("@/lib/server/queue-processor")
	await queueProcessor.cancelQueuedRun(runId)

	revalidatePath("/runs")
}
12 changes: 12 additions & 0 deletions apps/web-evals/src/app/api/queue/status/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { NextResponse } from "next/server"
import { getQueueStats } from "@/lib/server/queue"

/**
 * Route handler that returns the current queue statistics as JSON.
 *
 * On any failure the error is logged and a JSON error body is returned
 * with HTTP status 500.
 */
export async function GET() {
	try {
		return NextResponse.json(await getQueueStats())
	} catch (err) {
		console.error("Error fetching queue stats:", err)

		const failureBody = { error: "Failed to fetch queue status" }
		return NextResponse.json(failureBody, { status: 500 })
	}
}
95 changes: 86 additions & 9 deletions apps/web-evals/src/components/home/run.tsx
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import { useCallback, useState, useRef } from "react"
import Link from "next/link"
import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash } from "lucide-react"
import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Clock, Play, X } from "lucide-react"

import type { Run as EvalsRun, TaskMetrics as EvalsTaskMetrics } from "@roo-code/evals"

import { deleteRun } from "@/actions/runs"
import { deleteRun, cancelRun } from "@/actions/runs"
import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters"
import { useCopyRun } from "@/hooks/use-copy-run"
import {
Expand All @@ -23,16 +23,19 @@ import {
AlertDialogFooter,
AlertDialogHeader,
AlertDialogTitle,
Badge,
} from "@/components/ui"

type RunProps = {
run: EvalsRun
run: EvalsRun & { status?: string; queuePosition?: number | null }
taskMetrics: EvalsTaskMetrics | null
}

export function Run({ run, taskMetrics }: RunProps) {
const [deleteRunId, setDeleteRunId] = useState<number>()
const [cancelRunId, setCancelRunId] = useState<number>()
const continueRef = useRef<HTMLButtonElement>(null)
const cancelRef = useRef<HTMLButtonElement>(null)
const { isPending, copyRun, copied } = useCopyRun(run.id)

const onConfirmDelete = useCallback(async () => {
Expand All @@ -48,16 +51,62 @@ export function Run({ run, taskMetrics }: RunProps) {
}
}, [deleteRunId])

// Confirms the pending cancellation: calls the cancelRun action for the
// selected run id and clears the selection on success. Errors are logged and
// the selection is left in place. Does nothing when no run id is selected.
const onConfirmCancel = useCallback(async () => {
	if (cancelRunId) {
		try {
			await cancelRun(cancelRunId)
			setCancelRunId(undefined)
		} catch (err) {
			console.error(err)
		}
	}
}, [cancelRunId])

// Maps run.status to a badge: "queued" (with "#<position>" when a truthy
// queuePosition is present), "running" or "cancelled". Any other status
// renders no badge.
const getStatusBadge = () => {
	switch (run.status) {
		case "queued": {
			const positionLabel = run.queuePosition ? `#${run.queuePosition}` : ""

			return (
				<Badge variant="secondary" className="gap-1">
					<Clock className="h-3 w-3" />
					Queued {positionLabel}
				</Badge>
			)
		}
		case "running":
			return (
				<Badge variant="default" className="gap-1">
					<Play className="h-3 w-3" />
					Running
				</Badge>
			)
		case "cancelled":
			return (
				<Badge variant="destructive" className="gap-1">
					<X className="h-3 w-3" />
					Cancelled
				</Badge>
			)
		default:
			return null
	}
}

return (
<>
<TableRow>
<TableCell>{run.model}</TableCell>
<TableCell>{run.passed}</TableCell>
<TableCell>{run.failed}</TableCell>
<TableCell>
{run.passed + run.failed > 0 && (
<span>{((run.passed / (run.passed + run.failed)) * 100).toFixed(1)}%</span>
)}
<div className="flex items-center gap-2">
{run.model}
{getStatusBadge()}
</div>
</TableCell>
<TableCell>{run.status === "completed" || run.status === "failed" ? run.passed : "-"}</TableCell>
<TableCell>{run.status === "completed" || run.status === "failed" ? run.failed : "-"}</TableCell>
<TableCell>
{run.status === "completed" || run.status === "failed"
? run.passed + run.failed > 0 && (
<span>{((run.passed / (run.passed + run.failed)) * 100).toFixed(1)}%</span>
)
: "-"}
</TableCell>
<TableCell>
{taskMetrics && (
Expand Down Expand Up @@ -116,6 +165,18 @@ export function Run({ run, taskMetrics }: RunProps) {
</div>
</DropdownMenuItem>
)}
{run.status === "queued" && (
<DropdownMenuItem
onClick={() => {
setCancelRunId(run.id)
setTimeout(() => cancelRef.current?.focus(), 0)
}}>
<div className="flex items-center gap-1">
<X />
<div>Cancel</div>
</div>
</DropdownMenuItem>
)}
<DropdownMenuItem
onClick={() => {
setDeleteRunId(run.id)
Expand Down Expand Up @@ -144,6 +205,22 @@ export function Run({ run, taskMetrics }: RunProps) {
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
<AlertDialog open={!!cancelRunId} onOpenChange={() => setCancelRunId(undefined)}>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>Cancel queued run?</AlertDialogTitle>
<AlertDialogDescription>
This will remove the run from the queue. The run will not be executed.
</AlertDialogDescription>
</AlertDialogHeader>
<AlertDialogFooter>
<AlertDialogCancel>Keep in Queue</AlertDialogCancel>
<AlertDialogAction ref={cancelRef} onClick={onConfirmCancel}>
Cancel Run
</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
</>
)
}
Loading
Loading