113 changes: 113 additions & 0 deletions apps/web-evals/src/actions/queue.ts
@@ -0,0 +1,113 @@
"use server"

import fs from "fs"
import { spawn } from "child_process"
import { revalidatePath } from "next/cache"

import { deleteRun as _deleteRun } from "@roo-code/evals"

import { redisClient } from "@/lib/server/redis"

const RUN_QUEUE_KEY = "evals:run-queue"
const ACTIVE_RUN_KEY = "evals:active-run"
const DISPATCH_LOCK_KEY = "evals:dispatcher:lock"
const ACTIVE_RUN_TTL_SECONDS = 60 * 60 * 12 // 12 hours
Contributor:

The 12-hour TTL seems quite generous. If a run crashes without clearing the active marker, the queue could be blocked for up to 12 hours. Consider:

  1. A shorter TTL (e.g., 2-4 hours)
  2. Implementing a heartbeat mechanism to refresh the TTL periodically
  3. Adding a manual "unlock" admin action for stuck queues
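To make option 2 concrete, here is a minimal sketch of a heartbeat that refreshes a much shorter TTL while the controller is alive. It assumes the redisClient and key constants already in this file; startActiveRunHeartbeat is a hypothetical helper, not something in this PR:

const HEARTBEAT_INTERVAL_MS = 60_000
const SHORT_ACTIVE_RUN_TTL_SECONDS = 60 * 10 // 10 minutes, refreshed every minute

async function startActiveRunHeartbeat(runId: number) {
	const redis = await redisClient()

	const timer = setInterval(async () => {
		try {
			// Only refresh the TTL while this run still owns the active marker.
			const current = await redis.get(ACTIVE_RUN_KEY)
			if (current === runId.toString()) {
				await redis.expire(ACTIVE_RUN_KEY, SHORT_ACTIVE_RUN_TTL_SECONDS)
			} else {
				clearInterval(timer)
			}
		} catch {
			// Best-effort: a missed refresh just means the short TTL acts as the backstop.
		}
	}, HEARTBEAT_INTERVAL_MS)

	// The caller stops the heartbeat when the run finishes.
	return () => clearInterval(timer)
}

With something like this in place, the TTL only has to survive a few missed heartbeats rather than the longest plausible run.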

const DISPATCH_LOCK_TTL_SECONDS = 30

async function spawnController(runId: number) {
Contributor:

This function is duplicated in packages/evals/src/cli/queue.ts. Could we extract this to a shared utility to maintain DRY principles and ensure consistency?
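For reference, a sketch of what the shared helper might look like, with the container check passed in so each caller can keep its own detection; the module path is hypothetical:

// packages/evals/src/cli/spawnController.ts (hypothetical location)
import fs from "node:fs"
import { spawn } from "node:child_process"

export function spawnController(runId: number, containerized: boolean) {
	const dockerArgs = [
		`--name evals-controller-${runId}`,
		"--rm",
		"--network evals_default",
		"-v /var/run/docker.sock:/var/run/docker.sock",
		"-v /tmp/evals:/var/log/evals",
		"-e HOST_EXECUTION_METHOD=docker",
	]

	const cliCommand = `pnpm --filter @roo-code/evals cli --runId ${runId}`
	const command = containerized ? `docker run ${dockerArgs.join(" ")} evals-runner sh -c "${cliCommand}"` : cliCommand

	const childProcess = spawn("sh", ["-c", command], {
		detached: true,
		stdio: ["ignore", "pipe", "pipe"],
	})

	// Best-effort logging of controller output.
	try {
		const logStream = fs.createWriteStream("/tmp/roo-code-evals.log", { flags: "a" })
		childProcess.stdout?.pipe(logStream)
		childProcess.stderr?.pipe(logStream)
	} catch {
		// Intentionally ignore logging pipe errors.
	}

	childProcess.unref()
}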

const isRunningInDocker = fs.existsSync("/.dockerenv")

const dockerArgs = [
`--name evals-controller-${runId}`,
"--rm",
"--network evals_default",
"-v /var/run/docker.sock:/var/run/docker.sock",
"-v /tmp/evals:/var/log/evals",
"-e HOST_EXECUTION_METHOD=docker",
]

const cliCommand = `pnpm --filter @roo-code/evals cli --runId ${runId}`

const command = isRunningInDocker
? `docker run ${dockerArgs.join(" ")} evals-runner sh -c "${cliCommand}"`
: cliCommand

const childProcess = spawn("sh", ["-c", command], {
detached: true,
stdio: ["ignore", "pipe", "pipe"],
})

// Best-effort logging of controller output
try {
const logStream = fs.createWriteStream("/tmp/roo-code-evals.log", { flags: "a" })
childProcess.stdout?.pipe(logStream)
childProcess.stderr?.pipe(logStream)
} catch (_error) {
// Intentionally ignore logging pipe errors
}

childProcess.unref()
}

/**
* Enqueue a run into the global FIFO (idempotent).
*/
export async function enqueueRun(runId: number) {
const redis = await redisClient()
const exists = await redis.lPos(RUN_QUEUE_KEY, runId.toString())
if (exists === null) {
await redis.rPush(RUN_QUEUE_KEY, runId.toString())
}
revalidatePath("/runs")
}

/**
* Dispatcher: if no active run, pop next from queue and start controller.
* Uses a short-lived lock to avoid races between concurrent dispatchers.
*/
export async function dispatchNextRun() {
const redis = await redisClient()

// Try to acquire dispatcher lock
const locked = await redis.set(DISPATCH_LOCK_KEY, "1", { NX: true, EX: DISPATCH_LOCK_TTL_SECONDS })
Contributor:

Is there a potential race condition here? If the dispatcher lock expires (30s) while we're still processing, another dispatcher could start processing the same queue. Consider:

  1. Extending the lock TTL if processing takes longer
  2. Adding a heartbeat to refresh the lock
  3. Using a longer initial TTL
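One way to harden the release regardless of which TTL is chosen: store a unique token in the lock and release it with a compare-and-delete script, so a dispatcher whose lock already expired cannot delete a newer holder's lock. A sketch assuming the node-redis client used in this file; withDispatchLock is a hypothetical wrapper, not part of this PR:

import { randomUUID } from "node:crypto"

// Runs atomically on the Redis server: delete the lock only if we still own it.
const RELEASE_LOCK_SCRIPT = `
if redis.call("get", KEYS[1]) == ARGV[1] then
	return redis.call("del", KEYS[1])
end
return 0
`

async function withDispatchLock(fn: () => Promise<void>) {
	const redis = await redisClient()
	const token = randomUUID()

	const locked = await redis.set(DISPATCH_LOCK_KEY, token, { NX: true, EX: DISPATCH_LOCK_TTL_SECONDS })
	if (!locked) return

	try {
		await fn()
	} finally {
		// A no-op if the lock expired and was re-acquired by another dispatcher.
		await redis.eval(RELEASE_LOCK_SCRIPT, { keys: [DISPATCH_LOCK_KEY], arguments: [token] }).catch(() => {})
	}
}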

if (!locked) return

try {
// If an active run is present, nothing to do.
const active = await redis.get(ACTIVE_RUN_KEY)
if (active) return

const nextId = await redis.lPop(RUN_QUEUE_KEY)
if (!nextId) return

const ok = await redis.set(ACTIVE_RUN_KEY, nextId, { NX: true, EX: ACTIVE_RUN_TTL_SECONDS })
if (!ok) {
// Put it back to preserve order and exit.
await redis.lPush(RUN_QUEUE_KEY, nextId)
return
}

await spawnController(Number(nextId))
} finally {
await redis.del(DISPATCH_LOCK_KEY).catch(() => {})
Copilot AI (Sep 14, 2025):

Using an empty catch block silently ignores all errors. Consider adding a comment explaining why errors are being ignored, or logging the error for debugging purposes.

Suggested change
await redis.del(DISPATCH_LOCK_KEY).catch(() => {})
await redis.del(DISPATCH_LOCK_KEY).catch((err) => {
console.error("Failed to delete dispatcher lock key:", err)
})

Contributor:

I agree with Copilot - could we add error logging here for debugging purposes? Silent failures make troubleshooting difficult in production.

Suggested change
await redis.del(DISPATCH_LOCK_KEY).catch(() => {})
await redis.del(DISPATCH_LOCK_KEY).catch((err) => {
console.error("Failed to delete dispatcher lock:", err)
})

}
}

/**
* Return 1-based position in the global FIFO queue, or null if not queued.
*/
export async function getQueuePosition(runId: number): Promise<number | null> {
const redis = await redisClient()
const idx = await redis.lPos(RUN_QUEUE_KEY, runId.toString())
return idx === null ? null : idx + 1
}

/**
* Remove a queued run from the FIFO queue and delete the run record.
*/
export async function cancelQueuedRun(runId: number) {
const redis = await redisClient()
await redis.lRem(RUN_QUEUE_KEY, 1, runId.toString())
await _deleteRun(runId)
revalidatePath("/runs")
}
42 changes: 5 additions & 37 deletions apps/web-evals/src/actions/runs.ts
@@ -1,9 +1,9 @@
"use server"

import * as path from "path"
import fs from "fs"
import { fileURLToPath } from "url"
import { spawn } from "child_process"

import { enqueueRun, dispatchNextRun } from "@/actions/queue"

import { revalidatePath } from "next/cache"
import pMap from "p-map"
@@ -52,41 +52,9 @@ export async function createRun({ suite, exercises = [], systemPrompt, timeout,
revalidatePath("/runs")

try {
const isRunningInDocker = fs.existsSync("/.dockerenv")

const dockerArgs = [
`--name evals-controller-${run.id}`,
"--rm",
"--network evals_default",
"-v /var/run/docker.sock:/var/run/docker.sock",
"-v /tmp/evals:/var/log/evals",
"-e HOST_EXECUTION_METHOD=docker",
]

const cliCommand = `pnpm --filter @roo-code/evals cli --runId ${run.id}`

const command = isRunningInDocker
? `docker run ${dockerArgs.join(" ")} evals-runner sh -c "${cliCommand}"`
: cliCommand

console.log("spawn ->", command)

const childProcess = spawn("sh", ["-c", command], {
detached: true,
stdio: ["ignore", "pipe", "pipe"],
})

const logStream = fs.createWriteStream("/tmp/roo-code-evals.log", { flags: "a" })

if (childProcess.stdout) {
childProcess.stdout.pipe(logStream)
}

if (childProcess.stderr) {
childProcess.stderr.pipe(logStream)
}

childProcess.unref()
// Enqueue the run and attempt to dispatch if no active run exists.
await enqueueRun(run.id)
await dispatchNextRun()
} catch (error) {
console.error(error)
Contributor:

This error handling only logs to console. Should we consider:

  1. Throwing the error to surface it to the UI?
  2. Adding telemetry/monitoring?
  3. Returning an error status to the caller?

Silent failures could leave users confused about why their run isn't queued.
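A sketch of option 3, reusing the enqueueRun and dispatchNextRun imports already in this file; the QueueOutcome type and enqueueAndDispatch helper are hypothetical, not part of this PR:

type QueueOutcome = { queued: true } | { queued: false; message: string }

async function enqueueAndDispatch(runId: number): Promise<QueueOutcome> {
	try {
		await enqueueRun(runId)
		await dispatchNextRun()
		return { queued: true }
	} catch (error) {
		console.error(error)
		// Surface a serializable message so the UI can explain why queueing failed.
		return { queued: false, message: error instanceof Error ? error.message : String(error) }
	}
}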

}
40 changes: 39 additions & 1 deletion apps/web-evals/src/components/home/run.tsx
@@ -1,10 +1,13 @@
import { useCallback, useState, useRef } from "react"
import Link from "next/link"
import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash } from "lucide-react"
import { useQuery } from "@tanstack/react-query"
import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, XCircle } from "lucide-react"

import type { Run as EvalsRun, TaskMetrics as EvalsTaskMetrics } from "@roo-code/evals"

import { deleteRun } from "@/actions/runs"
import { getHeartbeat } from "@/actions/heartbeat"
import { getQueuePosition, cancelQueuedRun } from "@/actions/queue"
import { formatCurrency, formatDuration, formatTokens, formatToolUsageSuccessRate } from "@/lib/formatters"
import { useCopyRun } from "@/hooks/use-copy-run"
import {
@@ -35,6 +38,23 @@ export function Run({ run, taskMetrics }: RunProps) {
const continueRef = useRef<HTMLButtonElement>(null)
const { isPending, copyRun, copied } = useCopyRun(run.id)

// Poll heartbeat and queue position for status column
const { data: heartbeat } = useQuery({
queryKey: ["getHeartbeat", run.id],
queryFn: () => getHeartbeat(run.id),
refetchInterval: 10_000,
Contributor:

The 10-second polling interval might be excessive for long-running queues. Consider making this configurable, or using a progressive interval (e.g., start at 10s and increase to 30s after a few polls).
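A sketch of the progressive variant, assuming TanStack Query v5, where refetchInterval may be a function; the one-minute cutoff is arbitrary:

const mountedAtRef = useRef(Date.now())

const { data: heartbeat } = useQuery({
	queryKey: ["getHeartbeat", run.id],
	queryFn: () => getHeartbeat(run.id),
	// Poll every 10s for the first minute, then back off to 30s.
	refetchInterval: () => (Date.now() - mountedAtRef.current < 60_000 ? 10_000 : 30_000),
})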

})

const { data: queuePosition } = useQuery({
queryKey: ["getQueuePosition", run.id],
queryFn: () => getQueuePosition(run.id),
refetchInterval: 10_000,
})

const isCompleted = !!run.taskMetricsId
const isRunning = !!heartbeat
const isQueued = !isCompleted && !isRunning && queuePosition !== null && queuePosition !== undefined
Copilot AI (Sep 14, 2025):

The condition queuePosition !== null && queuePosition !== undefined can be simplified to queuePosition != null which checks for both null and undefined in a single comparison.

Suggested change
const isQueued = !isCompleted && !isRunning && queuePosition !== null && queuePosition !== undefined
const isQueued = !isCompleted && !isRunning && queuePosition != null

Contributor:

I agree with Copilot's suggestion here. Could we simplify this to use queuePosition != null which checks for both null and undefined?

Suggested change
const isQueued = !isCompleted && !isRunning && queuePosition !== null && queuePosition !== undefined
const isQueued = !isCompleted && !isRunning && queuePosition != null


const onConfirmDelete = useCallback(async () => {
if (!deleteRunId) {
return
@@ -51,6 +71,9 @@ export function Run({ run, taskMetrics }: RunProps) {
return (
<>
<TableRow>
<TableCell>
{isCompleted ? "Completed" : isRunning ? "Running" : isQueued ? <>Queued (#{queuePosition})</> : ""}
</TableCell>
<TableCell>{run.model}</TableCell>
<TableCell>{run.passed}</TableCell>
<TableCell>{run.failed}</TableCell>
@@ -116,6 +139,21 @@ export function Run({ run, taskMetrics }: RunProps) {
</div>
</DropdownMenuItem>
)}
{isQueued && (
<DropdownMenuItem
onClick={async () => {
try {
await cancelQueuedRun(run.id)
} catch (error) {
console.error(error)
}
}}>
<div className="flex items-center gap-1">
<XCircle />
<div>Cancel</div>
</div>
</DropdownMenuItem>
)}
<DropdownMenuItem
onClick={() => {
setDeleteRunId(run.id)
3 changes: 2 additions & 1 deletion apps/web-evals/src/components/home/runs.tsx
@@ -18,6 +18,7 @@ export function Runs({ runs }: { runs: RunWithTaskMetrics[] }) {
<Table className="border border-t-0">
<TableHeader>
<TableRow>
<TableHead>Status</TableHead>
<TableHead>Model</TableHead>
<TableHead>Passed</TableHead>
<TableHead>Failed</TableHead>
@@ -34,7 +35,7 @@
runs.map(({ taskMetrics, ...run }) => <Row key={run.id} run={run} taskMetrics={taskMetrics} />)
) : (
<TableRow>
<TableCell colSpan={9} className="text-center">
<TableCell colSpan={10} className="text-center">
No eval runs yet.
<Button variant="link" onClick={() => router.push("/runs/new")}>
Launch
89 changes: 89 additions & 0 deletions packages/evals/src/cli/queue.ts
@@ -0,0 +1,89 @@
import fs from "node:fs"
import { spawn } from "node:child_process"

import { redisClient } from "./redis.js"
import { isDockerContainer } from "./utils.js"

const RUN_QUEUE_KEY = "evals:run-queue"
const ACTIVE_RUN_KEY = "evals:active-run"
const DISPATCH_LOCK_KEY = "evals:dispatcher:lock"
const ACTIVE_RUN_TTL_SECONDS = 60 * 60 * 12 // 12 hours
const DISPATCH_LOCK_TTL_SECONDS = 30

async function spawnController(runId: number) {
const containerized = isDockerContainer()

const dockerArgs = [
`--name evals-controller-${runId}`,
"--rm",
"--network evals_default",
"-v /var/run/docker.sock:/var/run/docker.sock",
"-v /tmp/evals:/var/log/evals",
"-e HOST_EXECUTION_METHOD=docker",
]

const cliCommand = `pnpm --filter @roo-code/evals cli --runId ${runId}`
const command = containerized ? `docker run ${dockerArgs.join(" ")} evals-runner sh -c "${cliCommand}"` : cliCommand

const childProcess = spawn("sh", ["-c", command], {
detached: true,
stdio: ["ignore", "pipe", "pipe"],
})

// Best-effort logging of controller output (host path or container path)
try {
const logStream = fs.createWriteStream("/tmp/roo-code-evals.log", { flags: "a" })
childProcess.stdout?.pipe(logStream)
childProcess.stderr?.pipe(logStream)
} catch {
Contributor:

There are empty catch blocks here and at line 85. For consistency with the web implementation, should we at least add a comment explaining why errors are ignored, or consider logging them?

// ignore logging errors
}

childProcess.unref()
}

/**
* Clear the active-run marker (if any) and try to dispatch the next run in FIFO order.
* Uses a short-lived lock to avoid races with other dispatchers (web app or other controllers).
*/
export async function finishActiveRunAndDispatch() {
const redis = await redisClient()

// Clear the active run marker first (if it exists). We do not care if it was already expired.
try {
await redis.del(ACTIVE_RUN_KEY)
} catch {
// ignore
}

// Try to acquire dispatcher lock (NX+EX). If we don't get it, another dispatcher will handle it.
const locked = await redis.set(DISPATCH_LOCK_KEY, "1", { NX: true, EX: DISPATCH_LOCK_TTL_SECONDS })
if (!locked) return

try {
// If another process re-marked the active run in the meantime, bail out.
const active = await redis.get(ACTIVE_RUN_KEY)
if (active) return

// Pop next run id from the head of the queue.
const nextId = await redis.lPop(RUN_QUEUE_KEY)
if (!nextId) return

// Mark as active (with TTL) to provide crash safety.
const ok = await redis.set(ACTIVE_RUN_KEY, nextId, { NX: true, EX: ACTIVE_RUN_TTL_SECONDS })
if (!ok) {
// Could not set active (race). Push id back to the head to preserve order and exit.
await redis.lPush(RUN_QUEUE_KEY, nextId)
return
}

// Spawn the next controller in background.
await spawnController(Number(nextId))
} finally {
try {
await redis.del(DISPATCH_LOCK_KEY)
} catch {
// ignore
}
}
}
2 changes: 2 additions & 0 deletions packages/evals/src/cli/runEvals.ts
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@ import { EVALS_REPO_PATH } from "../exercises/index.js"
import { Logger, getTag, isDockerContainer, resetEvalsRepo, commitEvalsRepoChanges } from "./utils.js"
import { startHeartbeat, stopHeartbeat } from "./redis.js"
import { processTask, processTaskInContainer } from "./runTask.js"
import { finishActiveRunAndDispatch } from "./queue.js"

export const runEvals = async (runId: number) => {
const run = await findRun(runId)
@@ -67,6 +68,7 @@ export const runEvals = async (runId: number) => {
} finally {
logger.info("cleaning up")
stopHeartbeat(run.id, heartbeat)
await finishActiveRunAndDispatch()
logger.close()
}
}