From 431a395ef1f6ad746b8d891d56d604e2ce1fe492 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Sun, 13 Apr 2025 00:49:51 -0700
Subject: [PATCH 1/2] Evals improvements

---
 evals/apps/cli/package.json                   |   6 +-
 evals/apps/cli/src/index.ts                   | 111 ++++++++++++------
 evals/apps/web/src/app/runs/[id]/run.tsx      |  97 +++++++--------
 .../web/src/app/runs/[id]/task-status.tsx     |   7 +-
 evals/apps/web/src/app/runs/new/new-run.tsx   |  14 ++-
 evals/apps/web/src/hooks/use-process-tree.ts  |   1 +
 evals/apps/web/src/hooks/use-run-status.ts    |  43 ++++---
 evals/packages/db/src/schema.ts               |   4 +-
 evals/packages/ipc/src/client.ts              |   2 +-
 evals/packages/ipc/src/server.ts              |   2 +-
 evals/packages/types/src/roo-code-defaults.ts |  32 ++---
 evals/packages/types/src/roo-code.ts          |  14 +++
 evals/pnpm-lock.yaml                          |   6 +
 evals/scripts/setup.sh                        |  33 ++++--
 14 files changed, 215 insertions(+), 157 deletions(-)
diff --git a/evals/apps/cli/package.json b/evals/apps/cli/package.json
index 3e7da0266e9..1b54765954c 100644
--- a/evals/apps/cli/package.json
+++ b/evals/apps/cli/package.json
@@ -16,10 +16,12 @@
 		"execa": "^9.5.2",
 		"gluegun": "^5.1.2",
 		"p-map": "^7.0.3",
-		"p-wait-for": "^5.0.2"
+		"p-wait-for": "^5.0.2",
+		"ps-tree": "^1.2.0"
 	},
 	"devDependencies": {
 		"@evals/eslint-config": "workspace:^",
-		"@evals/typescript-config": "workspace:^"
+		"@evals/typescript-config": "workspace:^",
+		"@types/ps-tree": "^1.1.6"
 	}
 }
diff --git a/evals/apps/cli/src/index.ts b/evals/apps/cli/src/index.ts
index 55474f15f86..2491b16ef68 100644
--- a/evals/apps/cli/src/index.ts
+++ b/evals/apps/cli/src/index.ts
@@ -6,6 +6,7 @@ import pMap from "p-map"
 import pWaitFor from "p-wait-for"
 import { execa, parseCommandString } from "execa"
 import { build, filesystem, GluegunPrompt, GluegunToolbox } from "gluegun"
+import psTree from "ps-tree"
 
 import {
 	type ExerciseLanguage,
@@ -36,8 +37,9 @@ import { getExercises } from "./exercises.js"
 type TaskResult = { success: boolean; retry: boolean }
 type TaskPromise = Promise<TaskResult>
 
-const TASK_TIMEOUT = 10 * 60 * 1_000
-const UNIT_TEST_TIMEOUT = 60 * 1_000
+const TASK_START_DELAY = 10 * 1_000 // added to each successive task's start delay (presumably ms — confirm in processTask)
+const TASK_TIMEOUT = 5 * 60 * 1_000 // ms pWaitFor waits for a task to finish or disconnect before it is cancelled
+const UNIT_TEST_TIMEOUT = 2 * 60 * 1_000 // ms before the setTimeout watchdog kills a unit-test command
 
 const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
 	go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
@@ -98,13 +100,11 @@ const run = async (toolbox: GluegunToolbox) => {
 		throw new Error("No tasks found.")
 	}
 
-	console.log(await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`)
-	console.log(await execa({ cwd: exercisesPath })`git config user.email "support@roocode.com"`)
-	console.log(await execa({ cwd: exercisesPath })`git checkout -f`)
-	console.log(await execa({ cwd: exercisesPath })`git clean -fd`)
-	console.log(
-		await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`,
-	)
+	await execa({ cwd: exercisesPath })`git config user.name "Roo Code"`
+	await execa({ cwd: exercisesPath })`git config user.email "support@roocode.com"`
+	await execa({ cwd: exercisesPath })`git checkout -f`
+	await execa({ cwd: exercisesPath })`git clean -fd`
+	await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`
 
 	fs.writeFileSync(
 		path.resolve(exercisesPath, "settings.json"),
@@ -145,11 +145,11 @@ const run = async (toolbox: GluegunToolbox) => {
 		}
 	}
 
-	let delay = 0
+	let delay = TASK_START_DELAY
 
 	for (const task of tasks) {
 		const promise = processTask(task, delay)
-		delay = delay + 5_000
+		delay = delay + TASK_START_DELAY
 		runningPromises.push(promise)
 		promise.then(() => processTaskResult(task, promise))
 
@@ -162,10 +162,10 @@ const run = async (toolbox: GluegunToolbox) => {
 	await Promise.all(runningPromises)
 
 	const result = await finishRun(run.id)
-	console.log("[cli#run]", result)
+	console.log(`${Date.now()} [cli#run]`, result)
 
-	console.log(await execa({ cwd: exercisesPath })`git add .`)
-	console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
+	await execa({ cwd: exercisesPath })`git add .`
+	await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`
 }
 
 const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
@@ -180,9 +180,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	// Don't await execa and store result as subprocess.
 	// subprocess.stdout.pipe(process.stdout)
 
-	// Sleep for a random amount of time before opening a new VSCode window.
-	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 5_000))
-	console.log(`Opening new VS Code window at ${workspacePath}`)
+	console.log(`${Date.now()} [cli#runExercise] Opening new VS Code window at ${workspacePath}`)
 
 	await execa({
 		env: {
@@ -192,15 +190,15 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	})`code --disable-workspace-trust -n ${workspacePath}`
 
 	// Give VSCode some time to spawn before connecting to its unix socket.
-	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
-	console.log(`Connecting to ${taskSocketPath}`)
+	await new Promise((resolve) => setTimeout(resolve, 3_000))
+	console.log(`${Date.now()} [cli#runExercise] Connecting to ${taskSocketPath}`)
 	const client = new IpcClient(taskSocketPath)
 
 	try {
 		await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 	} catch (error) {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
+		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] unable to connect`)
 		client.disconnect()
 		return { success: false, retry: false }
 	}
@@ -220,16 +218,20 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
 		const { eventName, payload } = taskEvent
 
-		server.broadcast({
-			type: IpcMessageType.TaskEvent,
-			origin: IpcOrigin.Server,
-			relayClientId: client.clientId!,
-			data: { ...taskEvent, taskId: task.id },
-		})
+		if (taskEvent.eventName !== RooCodeEventName.Message) {
+			server.broadcast({
+				type: IpcMessageType.TaskEvent,
+				origin: IpcOrigin.Server,
+				relayClientId: client.clientId!,
+				data: { ...taskEvent, taskId: task.id },
+			})
+		}
 
 		if (!ignoreEvents.includes(eventName)) {
-			console.log(`[cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`)
-			console.log(payload)
+			console.log(
+				`${Date.now()} [cli#runExercise | ${language} / ${exercise}] taskEvent -> ${eventName}`,
+				payload,
+			)
 		}
 
 		if (eventName === RooCodeEventName.TaskStarted) {
@@ -279,11 +281,11 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 	})
 
 	client.on(IpcMessageType.Disconnect, async () => {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
+		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] disconnect`)
 		isClientDisconnected = true
 	})
 
-	console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
+	console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] starting task`)
 
 	client.sendMessage({
 		type: IpcMessageType.TaskCommand,
@@ -307,7 +309,7 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 	} catch (error) {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)
+		console.log(`${Date.now()} [cli#runExercise | ${language} / ${exercise}] time limit reached`)
 
 		// Cancel the task.
 		if (rooTaskId && !isClientDisconnected) {
@@ -351,17 +353,56 @@ const runUnitTest = async ({ task }: { task: Task }) => {
 	let passed = true
 
 	for (const command of commands) {
-		const timeout = cmd.timeout ?? UNIT_TEST_TIMEOUT
-
 		try {
-			const result = await execa({ cwd, shell: true, reject: false, timeout })`${command}`
+			console.log(
+				`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] running "${command.join(" ")}"`,
+			)
+			const subprocess = execa({ cwd, shell: true, reject: false })`${command}`
+
+			// Watchdog: after UNIT_TEST_TIMEOUT, SIGKILL the command's process tree
+			// (descendants first, then the spawned process itself).
+			const timeout = setTimeout(async () => {
+				const descendants = await new Promise<number[]>((resolve) => {
+					psTree(subprocess.pid!, (err, children) => {
+						if (err) console.error("Error listing descendant processes:", err)
+						// A failed lookup resolves to [] so the spawned process is still killed below.
+						resolve(err ? [] : children.map((p) => parseInt(p.PID, 10)))
+					})
+				})
+
+				if (descendants.length > 0) {
+					try {
+						console.log(
+							`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${descendants.join(" ")}`,
+						)
+						// Interpolate the array itself: execa expands it to one argument per pid.
+						await execa`kill -9 ${descendants}`
+					} catch (error) {
+						console.error("Error killing descendant processes:", error)
+					}
+				}
+
+				console.log(
+					`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] killing ${subprocess.pid}`,
+				)
+				// reject: false — the process may already have exited; avoid an unhandled rejection in this timer.
+				await execa({ reject: false })`kill -9 ${subprocess.pid!}`
+			}, UNIT_TEST_TIMEOUT)
+
+			const result = await subprocess
+
+			console.log(
+				`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}] "${command.join(" ")}" result -> ${JSON.stringify(result)}`,
+			)
+
+			clearTimeout(timeout)
 
 			if (result.failed) {
 				passed = false
 				break
 			}
 		} catch (error) {
-			console.log("[cli#runUnitTest]", error)
+			console.log(`${Date.now()} [cli#runUnitTest | ${task.language} / ${task.exercise}]`, error)
 			passed = false
 			break
 		}
diff --git a/evals/apps/web/src/app/runs/[id]/run.tsx b/evals/apps/web/src/app/runs/[id]/run.tsx
index f9e1ac9f62d..84749fc9160 100644
--- a/evals/apps/web/src/app/runs/[id]/run.tsx
+++ b/evals/apps/web/src/app/runs/[id]/run.tsx
@@ -1,33 +1,44 @@
 "use client"
 
-import { useState, useRef } from "react"
-import { LoaderCircle, SquareTerminal } from "lucide-react"
+import { useMemo } from "react"
+import { LoaderCircle } from "lucide-react"
 
 import * as db from "@evals/db"
 
 import { formatCurrency, formatDuration, formatTokens } from "@/lib"
 import { useRunStatus } from "@/hooks/use-run-status"
-import {
-	Drawer,
-	DrawerContent,
-	DrawerHeader,
-	DrawerTitle,
-	ScrollArea,
-	Table,
-	TableBody,
-	TableCell,
-	TableHead,
-	TableHeader,
-	TableRow,
-} from "@/components/ui"
+import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
 
 import { TaskStatus } from "./task-status"
 import { ConnectionStatus } from "./connection-status"
 
+type TaskMetrics = Pick<db.TaskMetrics, "tokensIn" | "tokensOut" | "tokensContext" | "duration" | "cost">
+
 export function Run({ run }: { run: db.Run }) {
-	const { tasks, status, output, outputCounts } = useRunStatus(run)
-	const scrollAreaRef = useRef<HTMLDivElement>(null)
-	const [selectedTask, setSelectedTask] = useState<db.Task>()
+	const { tasks, status, tokenUsage, usageUpdatedAt } = useRunStatus(run)
+
+	const taskMetrics: Record<number, TaskMetrics> = useMemo(() => {
+		const metrics: Record<number, TaskMetrics> = {}
+		// Keyed by task id: a finished task's own taskMetrics, otherwise its live token usage (if any).
+		tasks?.forEach((task) => {
+			const usage = tokenUsage.get(task.id)
+
+			if (task.finishedAt && task.taskMetrics) {
+				metrics[task.id] = task.taskMetrics
+			} else if (usage) {
+				metrics[task.id] = {
+					tokensIn: usage.totalTokensIn,
+					tokensOut: usage.totalTokensOut,
+					tokensContext: usage.contextTokens,
+					duration: usage.duration ?? 0,
+					cost: usage.totalCost,
+				}
+			}
+		})
+
+		return metrics
+		// eslint-disable-next-line react-hooks/exhaustive-deps -- tokenUsage is a ref-held Map mutated in place; usageUpdatedAt is the recompute trigger
+	}, [tasks, tokenUsage, usageUpdatedAt])
 
 	return (
 		<>
@@ -57,38 +68,33 @@ export function Run({ run }: { run: db.Run }) {
 								<TableRow key={task.id}>
 									<TableCell>
 										<div className="flex items-center gap-2">
-											<TaskStatus task={task} />
+											<TaskStatus
+												task={task}
+												running={!!task.startedAt || !!tokenUsage.get(task.id)}
+											/>
 											<div>
 												{task.language}/{task.exercise}
 											</div>
-											{(outputCounts[task.id] ?? 0) > 0 && (
-												<div
-													className="flex items-center gap-1 cursor-pointer"
-													onClick={() => setSelectedTask(task)}>
-													<SquareTerminal className="size-4" />
-													<div className="font-mono text-xs text-foreground/50">
-														{outputCounts[task.id]}
-													</div>
-												</div>
-											)}
 										</div>
 									</TableCell>
-									{task.taskMetrics ? (
+									{taskMetrics[task.id] ? (
 										<>
 											<TableCell className="font-mono text-xs">
 												<div className="flex items-center justify-evenly">
-													<div>{formatTokens(task.taskMetrics.tokensIn)}</div>/
-													<div>{formatTokens(task.taskMetrics.tokensOut)}</div>
+													<div>{formatTokens(taskMetrics[task.id]!.tokensIn)}</div>/
+													<div>{formatTokens(taskMetrics[task.id]!.tokensOut)}</div>
 												</div>
 											</TableCell>
 											<TableCell className="font-mono text-xs">
-												{formatTokens(task.taskMetrics.tokensContext)}
+												{formatTokens(taskMetrics[task.id]!.tokensContext)}
 											</TableCell>
 											<TableCell className="font-mono text-xs">
-												{formatDuration(task.taskMetrics.duration)}
+												{taskMetrics[task.id]!.duration
+													? formatDuration(taskMetrics[task.id]!.duration)
+													: "-"}
 											</TableCell>
 											<TableCell className="font-mono text-xs">
-												{formatCurrency(task.taskMetrics.cost)}
+												{formatCurrency(taskMetrics[task.id]!.cost)}
 											</TableCell>
 										</>
 									) : (
@@ -100,27 +106,6 @@ export function Run({ run }: { run: db.Run }) {
 					</Table>
 				)}
 			</div>
-			<Drawer open={!!selectedTask} onOpenChange={() => setSelectedTask(undefined)}>
-				<DrawerContent>
-					<div className="mx-auto w-full max-w-2xl">
-						<DrawerHeader>
-							<DrawerTitle>
-								{selectedTask?.language}/{selectedTask?.exercise}
-							</DrawerTitle>
-						</DrawerHeader>
-						<div className="font-mono text-xs pb-12">
-							{selectedTask && (
-								<ScrollArea viewportRef={scrollAreaRef} className="h-96 rounded-sm border">
-									<div className="p-4">
-										<h4 className="mb-4 text-sm font-medium leading-none">Tags</h4>
-										{output.get(selectedTask.id)?.map((line, i) => <div key={i}>{line}</div>)}
-									</div>
-								</ScrollArea>
-							)}
-						</div>
-					</div>
-				</DrawerContent>
-			</Drawer>
 		</>
 	)
 }
diff --git a/evals/apps/web/src/app/runs/[id]/task-status.tsx b/evals/apps/web/src/app/runs/[id]/task-status.tsx
index 0c2ae4205d5..2e0b28b419f 100644
--- a/evals/apps/web/src/app/runs/[id]/task-status.tsx
+++ b/evals/apps/web/src/app/runs/[id]/task-status.tsx
@@ -4,16 +4,15 @@ import { type Task } from "@evals/db"
 
 type TaskStatusProps = {
 	task: Task
+	running: boolean
 }
 
-export const TaskStatus = ({ task }: TaskStatusProps) => {
+export const TaskStatus = ({ task, running }: TaskStatusProps) => {
 	return task.passed === false ? (
 		<CircleSlash className="size-4 text-destructive" />
 	) : task.passed === true ? (
 		<CircleCheck className="size-4 text-green-500" />
-	) : task.startedAt ? (
-		<LoaderCircle className="size-4 animate-spin" />
-	) : task.finishedAt ? (
+	) : running ? (
 		<LoaderCircle className="size-4 animate-spin" />
 	) : (
 		<CircleDashed className="size-4" />
diff --git a/evals/apps/web/src/app/runs/new/new-run.tsx b/evals/apps/web/src/app/runs/new/new-run.tsx
index 247441264a1..ad3f9d7228f 100644
--- a/evals/apps/web/src/app/runs/new/new-run.tsx
+++ b/evals/apps/web/src/app/runs/new/new-run.tsx
@@ -86,13 +86,25 @@ export function NewRun() {
 	const onSubmit = useCallback(
 		async (values: FormValues) => {
 			try {
+				if (mode === "openrouter") {
+					const openRouterModel = models.data?.find(({ id }) => id === model)
+
+					if (!openRouterModel) {
+						throw new Error("Model not found.")
+					}
+
+					const openRouterModelId = openRouterModel.id
+					const openRouterModelInfo = openRouterModel.modelInfo
+					values.settings = { ...(values.settings || {}), openRouterModelId, openRouterModelInfo }
+				}
+
 				const { id } = await createRun(values)
 				router.push(`/runs/${id}`)
 			} catch (e) {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
-		[router],
+		[mode, model, models.data, router],
 	)
 
 	const onFilterModels = useCallback(
diff --git a/evals/apps/web/src/hooks/use-process-tree.ts b/evals/apps/web/src/hooks/use-process-tree.ts
index 834e815f10c..35d7e7ce044 100644
--- a/evals/apps/web/src/hooks/use-process-tree.ts
+++ b/evals/apps/web/src/hooks/use-process-tree.ts
@@ -7,4 +7,5 @@ export const useProcessList = (pid: number | null) =>
 		queryKey: ["process-tree", pid],
 		queryFn: () => (pid ? getProcessList(pid) : []),
 		enabled: !!pid,
+		refetchInterval: 30_000,
 	})
diff --git a/evals/apps/web/src/hooks/use-run-status.ts b/evals/apps/web/src/hooks/use-run-status.ts
index a699dce38ee..3278e69e7a2 100644
--- a/evals/apps/web/src/hooks/use-run-status.ts
+++ b/evals/apps/web/src/hooks/use-run-status.ts
@@ -1,7 +1,7 @@
 import { useState, useCallback, useRef } from "react"
 import { useQuery, keepPreviousData } from "@tanstack/react-query"
 
-import { RooCodeEventName, taskEventSchema } from "@evals/types"
+import { RooCodeEventName, taskEventSchema, TokenUsage } from "@evals/types"
 import { Run } from "@evals/db"
 
 import { getTasks } from "@/lib/server/tasks"
@@ -9,14 +9,16 @@ import { useEventSource } from "@/hooks/use-event-source"
 
 export const useRunStatus = (run: Run) => {
 	const [tasksUpdatedAt, setTasksUpdatedAt] = useState<number>()
-	const outputRef = useRef<Map<number, string[]>>(new Map())
-	const [outputCounts, setOutputCounts] = useState<Record<number, number>>({})
+	const [usageUpdatedAt, setUsageUpdatedAt] = useState<number>()
+
+	const tokenUsage = useRef<Map<number, TokenUsage & { duration?: number }>>(new Map())
+	const startTimes = useRef<Map<number, number>>(new Map())
 
 	const { data: tasks } = useQuery({
 		queryKey: ["run", run.id, tasksUpdatedAt],
 		queryFn: async () => getTasks(run.id),
 		placeholderData: keepPreviousData,
-		refetchInterval: 10_000,
+		refetchInterval: 30_000,
 	})
 
 	const url = `/api/runs/${run.id}/stream`
@@ -47,28 +49,18 @@ export const useRunStatus = (run: Run) => {
 
 		switch (eventName) {
 			case RooCodeEventName.TaskStarted:
+				startTimes.current.set(taskId, Date.now())
+				break
 			case RooCodeEventName.TaskCompleted:
 			case RooCodeEventName.TaskAborted:
 				setTasksUpdatedAt(Date.now())
 				break
-			case RooCodeEventName.Message: {
-				const [
-					{
-						message: { text },
-					},
-				] = payload
-
-				if (text) {
-					outputRef.current.set(taskId, [...(outputRef.current.get(taskId) || []), text])
-					const outputCounts: Record<number, number> = {}
-
-					for (const [taskId, messages] of outputRef.current.entries()) {
-						outputCounts[taskId] = messages.length
-					}
-
-					setOutputCounts(outputCounts)
-				}
-
+			case RooCodeEventName.TaskTokenUsageUpdated: {
+				console.log("taskTokenUsageUpdated", payload)
+				const startTime = startTimes.current.get(taskId)
+				const duration = startTime ? Date.now() - startTime : undefined
+				tokenUsage.current.set(taskId, { ...payload[1], duration })
+				setUsageUpdatedAt(Date.now())
 				break
 			}
 		}
@@ -76,5 +68,10 @@ export const useRunStatus = (run: Run) => {
 
 	const status = useEventSource({ url, onMessage })
 
-	return { tasks, status, output: outputRef.current, outputCounts }
+	return {
+		status,
+		tasks,
+		tokenUsage: tokenUsage.current,
+		usageUpdatedAt,
+	}
 }
diff --git a/evals/packages/db/src/schema.ts b/evals/packages/db/src/schema.ts
index eb19de9fc09..522d5999fb1 100644
--- a/evals/packages/db/src/schema.ts
+++ b/evals/packages/db/src/schema.ts
@@ -2,7 +2,7 @@ import { sqliteTable, text, real, integer, blob, uniqueIndex } from "drizzle-orm
 import { relations } from "drizzle-orm"
 import { createInsertSchema } from "drizzle-zod"
 
-import { GlobalSettings, exerciseLanguages, rooCodeSettingsSchema } from "@evals/types"
+import { GlobalSettings, RooCodeSettings, exerciseLanguages, rooCodeSettingsSchema } from "@evals/types"
 
 /**
  * runs
@@ -13,7 +13,7 @@ export const runs = sqliteTable("runs", {
 	taskMetricsId: integer({ mode: "number" }).references(() => taskMetrics.id),
 	model: text().notNull(),
 	description: text(),
-	settings: blob({ mode: "json" }).$type<GlobalSettings>(),
+	settings: blob({ mode: "json" }).$type<RooCodeSettings>(),
 	pid: integer({ mode: "number" }),
 	socketPath: text().notNull(),
 	concurrency: integer({ mode: "number" }).default(2).notNull(),
diff --git a/evals/packages/ipc/src/client.ts b/evals/packages/ipc/src/client.ts
index 8b9c4c4b4b4..91e6b06cd0e 100644
--- a/evals/packages/ipc/src/client.ts
+++ b/evals/packages/ipc/src/client.ts
@@ -65,7 +65,7 @@ export class IpcClient extends EventEmitter<IpcClientEvents> {
 		const result = ipcMessageSchema.safeParse(data)
 
 		if (!result.success) {
-			this.log("[client#onMessage] invalid payload", data)
+			this.log("[client#onMessage] invalid payload", result.error, data)
 			return
 		}
 
diff --git a/evals/packages/ipc/src/server.ts b/evals/packages/ipc/src/server.ts
index e4c0138566f..cbd9cf930d9 100644
--- a/evals/packages/ipc/src/server.ts
+++ b/evals/packages/ipc/src/server.ts
@@ -83,7 +83,7 @@ export class IpcServer extends EventEmitter<IpcServerEvents> {
 		const result = ipcMessageSchema.safeParse(data)
 
 		if (!result.success) {
-			this.log("[server#onMessage] invalid payload", result.error)
+			this.log("[server#onMessage] invalid payload", result.error, data)
 			return
 		}
 
diff --git a/evals/packages/types/src/roo-code-defaults.ts b/evals/packages/types/src/roo-code-defaults.ts
index f126f33ff0e..dd7ff85775c 100644
--- a/evals/packages/types/src/roo-code-defaults.ts
+++ b/evals/packages/types/src/roo-code-defaults.ts
@@ -2,25 +2,9 @@ import { RooCodeSettings } from "./roo-code.js"
 
 export const rooCodeDefaults: RooCodeSettings = {
 	apiProvider: "openrouter",
-	openRouterModelId: "google/gemini-2.0-flash-001", // "anthropic/claude-3.7-sonnet",
+	openRouterUseMiddleOutTransform: false,
 
-	// apiProvider: "openai",
-	// openAiBaseUrl: "http://hrudolph.duckdns.org:4269/api/v1",
-	// openAiApiKey: process.env.OPENAI_API_KEY,
-	// openAiModelId: "models/gemini-2.5-pro-exp-03-25",
-	// openAiCustomModelInfo: {
-	// 	maxTokens: 65536,
-	// 	contextWindow: 1000000,
-	// 	supportsImages: true,
-	// 	supportsPromptCache: false,
-	// 	inputPrice: 0,
-	// 	outputPrice: 0,
-	// 	description:
-	// 		"Gemini 2.5 Pro is Google’s state-of-the-art AI model designed for advanced reasoning, coding, mathematics, and scientific tasks. It employs “thinking” capabilities, enabling it to reason through responses with enhanced accuracy and nuanced context handling. Gemini 2.5 Pro achieves top-tier performance on multiple benchmarks, including first-place positioning on the LMArena leaderboard, reflecting superior human-preference alignment and complex problem-solving abilities.",
-	// 	thinking: false,
-	// },
-
-	modelTemperature: null,
+	// modelTemperature: null,
 	// reasoningEffort: "high",
 
 	pinnedApiConfigs: {},
@@ -60,12 +44,18 @@ export const rooCodeDefaults: RooCodeSettings = {
 	maxReadFileLine: 500,
 
 	terminalOutputLineLimit: 500,
-	terminalShellIntegrationTimeout: 15000,
+	terminalShellIntegrationTimeout: 30_000,
+	// terminalCommandDelay: 0,
+	// terminalPowershellCounter: false,
+	// terminalZshClearEolMark: true,
+	// terminalZshOhMy: true,
+	// terminalZshP10k: false,
+	// terminalZdotdir: true,
 
-	diffEnabled: true,
+	diffEnabled: false,
 	fuzzyMatchThreshold: 1.0,
 	experiments: {
-		search_and_replace: true,
+		search_and_replace: false,
 		insert_content: false,
 		powerSteering: false,
 	},
diff --git a/evals/packages/types/src/roo-code.ts b/evals/packages/types/src/roo-code.ts
index 5a4082395b8..7c982f29446 100644
--- a/evals/packages/types/src/roo-code.ts
+++ b/evals/packages/types/src/roo-code.ts
@@ -396,6 +396,7 @@ const providerSettingsRecord: ProviderSettingsRecord = {
 	apiModelId: undefined,
 	apiKey: undefined,
 	anthropicBaseUrl: undefined,
+	anthropicUseAuthToken: undefined,
 	// Glama
 	glamaModelId: undefined,
 	glamaModelInfo: undefined,
@@ -523,6 +524,12 @@ export const globalSettingsSchema = z.object({
 
 	terminalOutputLineLimit: z.number().optional(),
 	terminalShellIntegrationTimeout: z.number().optional(),
+	terminalCommandDelay: z.number().optional(),
+	terminalPowershellCounter: z.boolean().optional(),
+	terminalZshClearEolMark: z.boolean().optional(),
+	terminalZshOhMy: z.boolean().optional(),
+	terminalZshP10k: z.boolean().optional(),
+	terminalZdotdir: z.boolean().optional(),
 
 	diffEnabled: z.boolean().optional(),
 	fuzzyMatchThreshold: z.number().optional(),
@@ -592,6 +599,12 @@ const globalSettingsRecord: GlobalSettingsRecord = {
 
 	terminalOutputLineLimit: undefined,
 	terminalShellIntegrationTimeout: undefined,
+	terminalCommandDelay: undefined,
+	terminalPowershellCounter: undefined,
+	terminalZshClearEolMark: undefined,
+	terminalZshOhMy: undefined,
+	terminalZshP10k: undefined,
+	terminalZdotdir: undefined,
 
 	diffEnabled: undefined,
 	fuzzyMatchThreshold: undefined,
@@ -731,6 +744,7 @@ export const clineSays = [
 	"new_task",
 	"checkpoint_saved",
 	"rooignore_error",
+	"diff_error",
 ] as const
 
 export const clineSaySchema = z.enum(clineSays)
diff --git a/evals/pnpm-lock.yaml b/evals/pnpm-lock.yaml
index b50e3a3492c..536ad19e3f2 100644
--- a/evals/pnpm-lock.yaml
+++ b/evals/pnpm-lock.yaml
@@ -62,6 +62,9 @@ importers:
       p-wait-for:
         specifier: ^5.0.2
         version: 5.0.2
+      ps-tree:
+        specifier: ^1.2.0
+        version: 1.2.0
     devDependencies:
       '@evals/eslint-config':
         specifier: workspace:^
@@ -69,6 +72,9 @@ importers:
       '@evals/typescript-config':
         specifier: workspace:^
         version: link:../../config/typescript
+      '@types/ps-tree':
+        specifier: ^1.1.6
+        version: 1.1.6
 
   apps/web:
     dependencies:
diff --git a/evals/scripts/setup.sh b/evals/scripts/setup.sh
index ed66963542b..f58f80793e9 100755
--- a/evals/scripts/setup.sh
+++ b/evals/scripts/setup.sh
@@ -275,6 +275,25 @@ fi
 
 pnpm install --silent || exit 1
 
+if ! command -v code &>/dev/null; then
+  echo "⚠️ Visual Studio Code cli is not installed"
+  exit 1
+else
+  VSCODE_VERSION=$(code --version | head -n 1)
+  echo "✅ Visual Studio Code is installed ($VSCODE_VERSION)"
+fi
+
+# To reset VSCode:
+# rm -rvf ~/.vscode && rm -rvf ~/Library/Application\ Support/Code
+
+echo "🔌 Installing Visual Studio Code extensions..."
+code --install-extension golang.go &>/dev/null || exit 1
+code --install-extension dbaeumer.vscode-eslint &>/dev/null || exit 1
+code --install-extension redhat.java &>/dev/null || exit 1
+code --install-extension ms-python.python &>/dev/null || exit 1
+code --install-extension rust-lang.rust-analyzer &>/dev/null || exit 1
+code --install-extension rooveterinaryinc.roo-cline &>/dev/null || exit 1
+
 if [[ ! -d "../../evals" ]]; then
   if gh auth status &>/dev/null; then
     read -p "🔗 Would you like to be able to share eval results? (Y/n): " fork_evals
@@ -293,9 +312,9 @@ if [[ ! -s .env ]]; then
   cp .env.sample .env || exit 1
 fi
 
-echo "🗄️ Syncing database..."
-pnpm --filter @evals/db db:push || exit 1
-pnpm --filter @evals/db db:enable-wal || exit 1
+echo "🗄️ Syncing Roo Code evals database..."
+pnpm --filter @evals/db db:push &>/dev/null || exit 1
+pnpm --filter @evals/db db:enable-wal &>/dev/null || exit 1
 
 if ! grep -q "OPENROUTER_API_KEY" .env; then
   read -p "🔐 Enter your OpenRouter API key (sk-or-v1-...): " openrouter_api_key
@@ -304,14 +323,6 @@ if ! grep -q "OPENROUTER_API_KEY" .env; then
   echo "OPENROUTER_API_KEY=$openrouter_api_key" >> .env || exit 1
 fi
 
-if ! command -v code &>/dev/null; then
-  echo "⚠️ Visual Studio Code cli is not installed"
-  exit 1
-else
-  VSCODE_VERSION=$(code --version | head -n 1)
-  echo "✅ Visual Studio Code is installed ($VSCODE_VERSION)"
-fi
-
 if [[ ! -s "../bin/roo-code-latest.vsix" ]]; then
   build_extension
 else

From b4ba4cb87197af0edee1597501418c4a6fd76355 Mon Sep 17 00:00:00 2001
From: cte <cestreich@gmail.com>
Date: Sun, 13 Apr 2025 00:56:49 -0700
Subject: [PATCH 2/2] Remove debugging

---
 evals/apps/web/src/hooks/use-run-status.ts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/evals/apps/web/src/hooks/use-run-status.ts b/evals/apps/web/src/hooks/use-run-status.ts
index 3278e69e7a2..1d463fc931d 100644
--- a/evals/apps/web/src/hooks/use-run-status.ts
+++ b/evals/apps/web/src/hooks/use-run-status.ts
@@ -56,7 +56,6 @@ export const useRunStatus = (run: Run) => {
 				setTasksUpdatedAt(Date.now())
 				break
 			case RooCodeEventName.TaskTokenUsageUpdated: {
-				console.log("taskTokenUsageUpdated", payload)
 				const startTime = startTimes.current.get(taskId)
 				const duration = startTime ? Date.now() - startTime : undefined
 				tokenUsage.current.set(taskId, { ...payload[1], duration })