diff --git a/evals/apps/cli/src/index.ts b/evals/apps/cli/src/index.ts
index ea88ed9fcc1..78a9ad64379 100644
--- a/evals/apps/cli/src/index.ts
+++ b/evals/apps/cli/src/index.ts
@@ -33,13 +33,17 @@ import { IpcServer, IpcClient } from "@evals/ipc"
 import { __dirname, extensionDevelopmentPath, exercisesPath } from "./paths.js"
 import { getExercises } from "./exercises.js"
 
-const maxConcurrency = 2
-const taskTimeLimit = 5 * 60 * 1_000
+type TaskResult = { success: boolean; retry: boolean }
+type TaskPromise = Promise<TaskResult>
+
+const MAX_CONCURRENCY = 20
+const TASK_TIMEOUT = 10 * 60 * 1_000
+const UNIT_TEST_TIMEOUT = 60 * 1_000
 
 const testCommands: Record<ExerciseLanguage, { commands: string[]; timeout?: number; cwd?: string }> = {
 	go: { commands: ["go test"] }, // timeout 15s bash -c "cd '$dir' && go test > /dev/null 2>&1"
 	java: { commands: ["./gradlew test"] }, // timeout --foreground 15s bash -c "cd '$dir' && ./gradlew test > /dev/null 2>&1"
-	javascript: { commands: ["pnpm install", "pnpm test"], timeout: 30_000 }, // timeout 30s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1"
+	javascript: { commands: ["pnpm install", "pnpm test"] }, // timeout 15s bash -c "cd '$dir' && pnpm install >/dev/null 2>&1 && pnpm test >/dev/null 2>&1"
 	python: { commands: ["uv run python3 -m pytest -o markers=task *_test.py"] }, // timeout 15s bash -c "cd '$dir' && uv run python3 -m pytest -o markers=task *_test.py"
 	rust: { commands: ["cargo test"] }, // timeout 15s bash -c "cd '$dir' && cargo test > /dev/null 2>&1"
 }
@@ -107,40 +111,42 @@ const run = async (toolbox: GluegunToolbox) => {
 	const server = new IpcServer(run.socketPath, () => {})
 	server.listen()
 
-	// server.on(IpcMessageType.Connect, (clientId) => {
-	// 	server.send(clientId, {
-	// 		type: IpcMessageType.TaskEvent,
-	// 		origin: IpcOrigin.Server,
-	// 		data: { eventName: RooCodeEventName.Connect, taskId: -1 },
-	// 	})
-	// })
-
-	const runningPromises: Promise<void>[] = []
+	const runningPromises: TaskPromise[] = []
 
+	// Retries aren't implemented yet, but the return values are set up to
+	// support them.
 	const processTask = async (task: Task) => {
 		if (task.finishedAt === null) {
-			await runExercise({ run, task, server })
+			const { retry } = await runExercise({ run, task, server })
+
+			if (retry) {
+				return { success: false, retry: true }
+			}
 		}
 
 		if (task.passed === null) {
 			const passed = await runUnitTest({ task })
 			await updateTask(task.id, { passed })
+			return { success: passed, retry: false }
+		} else {
+			return { success: task.passed, retry: false }
 		}
 	}
 
-	for (const task of tasks) {
-		const taskPromise = processTask(task)
-		runningPromises.push(taskPromise)
+	const processTaskResult = async (task: Task, promise: TaskPromise) => {
+		const index = runningPromises.indexOf(promise)
 
-		taskPromise.finally(() => {
-			const index = runningPromises.indexOf(taskPromise)
+		if (index > -1) {
+			runningPromises.splice(index, 1)
+		}
+	}
 
-			if (index > -1) {
-				runningPromises.splice(index, 1)
-			}
-		})
+	for (const task of tasks) {
+		const promise = processTask(task)
+		runningPromises.push(promise)
+		promise.then(() => processTaskResult(task, promise))
 
-		if (runningPromises.length >= maxConcurrency) {
+		if (runningPromises.length >= MAX_CONCURRENCY) { // Block once MAX_CONCURRENCY tasks are in flight (">" would allow one extra).
 			await Promise.race(runningPromises)
 		}
 	}
@@ -148,89 +154,61 @@ const run = async (toolbox: GluegunToolbox) => {
 	await Promise.all(runningPromises)
 
 	const result = await finishRun(run.id)
-	try {
-		console.log("[cli#run]", result)
-		// eslint-disable-next-line @typescript-eslint/no-unused-vars
-	} catch (error) {
-		// console.error(error)
-	}
+	console.log("[cli#run]", result)
 
 	console.log(await execa({ cwd: exercisesPath })`git add .`)
 	console.log(await execa({ cwd: exercisesPath })`git commit -m ${`Run #${run.id}`} --no-verify`)
 }
 
-const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }) => {
+const runExercise = async ({ run, task, server }: { run: Run; task: Task; server: IpcServer }): TaskPromise => {
 	const { language, exercise } = task
 	const prompt = fs.readFileSync(path.resolve(exercisesPath, `prompts/${language}.md`), "utf-8")
 	const dirname = path.dirname(run.socketPath)
+	const workspacePath = path.resolve(exercisesPath, language, exercise)
 	const taskSocketPath = path.resolve(dirname, `${dirname}/task-${task.id}.sock`)
 
-	const controller = new AbortController()
-	const cancelSignal = controller.signal
-
 	// If debugging:
 	// Use --wait --log trace or --verbose.
-	const codeCommand = `code --disable-workspace-trust`
+	// Don't await execa and store result as subprocess.
+	// subprocess.stdout.pipe(process.stdout)
+
+	// Sleep for a random amount of time before opening a new VSCode window.
+	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * MAX_CONCURRENCY * 1_000))
+	console.log(`Opening new VS Code window at ${workspacePath}`)
 
 	await execa({
 		env: {
 			ROO_CODE_IPC_SOCKET_PATH: taskSocketPath,
 		},
 		shell: "/bin/bash",
-		cancelSignal,
-	})`${codeCommand} -n ${path.resolve(exercisesPath, language, exercise)}`
+	})`code --disable-workspace-trust -n ${workspacePath}`
 
-	// If debugging:
-	// Don't await execa and store result as subprocess.
-	// subprocess.stdout.pipe(process.stdout)
-
-	// Give VSCode some time to spawn before connectint to its unix socket.
-	await new Promise((resolve) => setTimeout(resolve, 1_000))
+	// Give VSCode some time to spawn before connecting to its unix socket.
+	await new Promise((resolve) => setTimeout(resolve, 1_000 + Math.random() * 4_000))
 	console.log(`Connecting to ${taskSocketPath}`)
+	const client = new IpcClient(taskSocketPath)
 
-	const createClient = (taskSocketPath: string) => {
-		const ipcClient = new IpcClient(taskSocketPath)
-
-		ipcClient.on(IpcMessageType.Ack, (ack) => {
-			console.log(`[cli#runExercise | ${language} / ${exercise}] ack`, ack)
-		})
-
-		return ipcClient
-	}
-
-	let tries = 0
-	let client = createClient(taskSocketPath)
-
-	while (++tries < 5) {
-		try {
-			await pWaitFor(() => client.isReady, { interval: 100, timeout: 5_000 })
-			break
-		} catch (error) {
-			console.error(error)
-			client.disconnect()
-			client = createClient(taskSocketPath)
-		}
+	try {
+		await pWaitFor(() => client.isReady, { interval: 250, timeout: 5_000 })
+		// eslint-disable-next-line @typescript-eslint/no-unused-vars
+	} catch (error) {
+		console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
+		client.disconnect()
+		return { success: false, retry: false }
 	}
 
-	let isTaskFinished = false
+	let taskStartedAt = Date.now()
+	let taskFinishedAt: number | undefined
+	let taskMetricsId: number | undefined
+	let rooTaskId: string | undefined
 	let isClientDisconnected = false
 
-	client.on(IpcMessageType.Disconnect, async () => {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
-		isTaskFinished = true
-		isClientDisconnected = true
-	})
-
 	const ignoreEvents: RooCodeEventName[] = [
-		// RooCodeEventName.Message,
+		RooCodeEventName.Message,
 		RooCodeEventName.TaskTokenUsageUpdated,
 		RooCodeEventName.TaskAskResponded,
 	]
 
-	let taskStartedAt = Date.now()
-	let taskMetricsId: number | undefined
-	let rooTaskId: string | undefined
-
 	client.on(IpcMessageType.TaskEvent, async (taskEvent) => {
 		const { eventName, payload } = taskEvent
 
@@ -287,44 +265,43 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 		}
 
 		if (eventName === RooCodeEventName.TaskCompleted || eventName === RooCodeEventName.TaskAborted) {
+			taskFinishedAt = Date.now()
 			await updateTask(task.id, { finishedAt: new Date() })
-			isTaskFinished = true
 		}
 	})
 
-	if (client.isReady) {
-		client.sendMessage({
-			type: IpcMessageType.TaskCommand,
-			origin: IpcOrigin.Client,
-			clientId: client.clientId!,
+	client.on(IpcMessageType.Disconnect, async () => {
+		console.log(`[cli#runExercise | ${language} / ${exercise}] disconnect`)
+		isClientDisconnected = true
+	})
+
+	console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
+
+	client.sendMessage({
+		type: IpcMessageType.TaskCommand,
+		origin: IpcOrigin.Client,
+		clientId: client.clientId!,
+		data: {
+			commandName: TaskCommandName.StartNewTask,
 			data: {
-				commandName: TaskCommandName.StartNewTask,
-				data: {
-					configuration: {
-						...rooCodeDefaults,
-						openRouterApiKey: process.env.OPENROUTER_API_KEY!,
-						...run.settings,
-					},
-					text: prompt,
-					newTab: true,
+				configuration: {
+					...rooCodeDefaults,
+					openRouterApiKey: process.env.OPENROUTER_API_KEY!,
+					...run.settings,
 				},
+				text: prompt,
+				newTab: true,
 			},
-		})
-
-		console.log(`[cli#runExercise | ${language} / ${exercise}] starting task`)
-	} else {
-		console.log(`[cli#runExercise | ${language} / ${exercise}] unable to connect`)
-		client.disconnect()
-		isTaskFinished = true
-		isClientDisconnected = true
-	}
+		},
+	})
 
 	try {
-		await pWaitFor(() => isTaskFinished, { interval: 1_000, timeout: taskTimeLimit })
+		await pWaitFor(() => !!taskFinishedAt || isClientDisconnected, { interval: 1_000, timeout: TASK_TIMEOUT })
 		// eslint-disable-next-line @typescript-eslint/no-unused-vars
 	} catch (error) {
 		console.log(`[cli#runExercise | ${language} / ${exercise}] time limit reached`)
 
+		// Cancel the task.
 		if (rooTaskId && !isClientDisconnected) {
 			client.sendMessage({
 				type: IpcMessageType.TaskCommand,
@@ -333,35 +310,28 @@ const runExercise = async ({ run, task, server }: { run: Run; task: Task; server
 				data: { commandName: TaskCommandName.CancelTask, data: rooTaskId },
 			})
 
-			await new Promise((resolve) => setTimeout(resolve, 2_000))
+			// Give the server some time to cancel the task.
+			await new Promise((resolve) => setTimeout(resolve, 5_000))
 		}
 
+		// TODO: Notify clients that the task timed out.
 		await updateTask(task.id, { finishedAt: new Date() })
 	}
 
 	if (!isClientDisconnected) {
-		try {
-			if (rooTaskId) {
-				client.sendMessage({
-					type: IpcMessageType.TaskCommand,
-					origin: IpcOrigin.Client,
-					clientId: client.clientId!,
-					data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
-				})
-			}
-
-			client.disconnect()
-		} catch (error) {
-			console.error(error)
+		if (rooTaskId) {
+			client.sendMessage({
+				type: IpcMessageType.TaskCommand,
+				origin: IpcOrigin.Client,
+				clientId: client.clientId!,
+				data: { commandName: TaskCommandName.CloseTask, data: rooTaskId },
+			})
 		}
+
+		client.disconnect()
 	}
 
-	// try {
-	// 	console.log(`[cli#runExercise | ${language} / ${exercise}] aborting subprocess`)
-	// 	controller.abort()
-	// 	await subprocess
-	// } catch (error) {
-	// }
+	return { success: !!taskFinishedAt, retry: false }
 }
 
 const runUnitTest = async ({ task }: { task: Task }) => {
@@ -373,22 +343,17 @@ const runUnitTest = async ({ task }: { task: Task }) => {
 	let passed = true
 
 	for (const command of commands) {
-		// const controller = new AbortController()
-		// const cancelSignal = controller.signal
-		// const timeout = setTimeout(() => controller.abort(), cmd.timeout ?? 15_000)
+		const timeout = cmd.timeout ?? UNIT_TEST_TIMEOUT
 
 		try {
-			const result = await execa({ cwd, shell: true, reject: false /* , cancelSignal */ })`${command}`
-			// console.log('[cli#run] execa result =', { ...result, cwd, command })
-
-			// clearTimeout(timeout)
+			const result = await execa({ cwd, shell: true, reject: false, timeout })`${command}`
 
 			if (result.failed) {
 				passed = false
 				break
 			}
 		} catch (error) {
-			console.log("[cli#run] execa error =", error)
+			console.log("[cli#runUnitTest]", error)
 			passed = false
 			break
 		}
diff --git a/evals/apps/web/src/app/home.tsx b/evals/apps/web/src/app/home.tsx
index 5ca4734ce8c..c85c69897f0 100644
--- a/evals/apps/web/src/app/home.tsx
+++ b/evals/apps/web/src/app/home.tsx
@@ -1,14 +1,14 @@
 "use client"
 
+import { useMemo } from "react"
 import { useRouter } from "next/navigation"
-import { Rocket } from "lucide-react"
+import Link from "next/link"
+import { ChevronRight, Rocket } from "lucide-react"
 
 import type { Run, TaskMetrics } from "@evals/db"
 
-import { formatCurrency, formatDuration } from "@/lib"
+import { formatCurrency, formatDuration, formatTokens } from "@/lib"
 import { Button, Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"
-import { useMemo } from "react"
-import Link from "next/link"
 
 export function Home({ runs }: { runs: (Run & { taskMetrics: TaskMetrics | null })[] }) {
 	const router = useRouter()
@@ -20,32 +20,39 @@ export function Home({ runs }: { runs: (Run & { taskMetrics: TaskMetrics | null
 			<Table className="border border-t-0">
 				<TableHeader>
 					<TableRow>
-						<TableHead>ID</TableHead>
 						<TableHead>Model</TableHead>
-						<TableHead>Timestamp</TableHead>
 						<TableHead>Passed</TableHead>
 						<TableHead>Failed</TableHead>
 						<TableHead>% Correct</TableHead>
+						<TableHead className="text-center">Tokens In / Out</TableHead>
 						<TableHead>Cost</TableHead>
 						<TableHead>Duration</TableHead>
+						<TableHead />
 					</TableRow>
 				</TableHeader>
 				<TableBody>
 					{visibleRuns.length ? (
 						visibleRuns.map(({ taskMetrics, ...run }) => (
 							<TableRow key={run.id}>
-								<TableCell>
-									<Button variant="link" asChild>
-										<Link href={`/runs/${run.id}`}>{run.id}</Link>
-									</Button>
-								</TableCell>
 								<TableCell>{run.model}</TableCell>
-								<TableCell>{new Date(run.createdAt).toLocaleString()}</TableCell>
 								<TableCell>{run.passed}</TableCell>
 								<TableCell>{run.failed}</TableCell>
 								<TableCell>{((run.passed / (run.passed + run.failed)) * 100).toFixed(1)}%</TableCell>
+								<TableCell>
+									<div className="flex items-center justify-evenly">
+										<div>{formatTokens(taskMetrics!.tokensIn)}</div>/
+										<div>{formatTokens(taskMetrics!.tokensOut)}</div>
+									</div>
+								</TableCell>
 								<TableCell>{formatCurrency(taskMetrics!.cost)}</TableCell>
 								<TableCell>{formatDuration(taskMetrics!.duration)}</TableCell>
+								<TableCell>
+									<Button variant="ghost" size="icon" asChild>
+										<Link href={`/runs/${run.id}`}>
+											<ChevronRight />
+										</Link>
+									</Button>
+								</TableCell>
 							</TableRow>
 						))
 					) : (
diff --git a/evals/apps/web/src/app/layout.tsx b/evals/apps/web/src/app/layout.tsx
index 42ee8a7ecc1..b8751069783 100644
--- a/evals/apps/web/src/app/layout.tsx
+++ b/evals/apps/web/src/app/layout.tsx
@@ -11,7 +11,7 @@ const fontSans = Geist({ variable: "--font-sans", subsets: ["latin"] })
 const fontMono = Geist_Mono({ variable: "--font-mono", subsets: ["latin"] })
 
 export const metadata: Metadata = {
-	title: "Roo Code Benchmarks",
+	title: "Roo Code Evals",
 }
 
 export default function RootLayout({
diff --git a/evals/apps/web/src/app/runs/[id]/run.tsx b/evals/apps/web/src/app/runs/[id]/run.tsx
index d9683aea03c..f9e1ac9f62d 100644
--- a/evals/apps/web/src/app/runs/[id]/run.tsx
+++ b/evals/apps/web/src/app/runs/[id]/run.tsx
@@ -1,6 +1,6 @@
 "use client"
 
-import { useState, useRef, useEffect } from "react"
+import { useState, useRef } from "react"
 import { LoaderCircle, SquareTerminal } from "lucide-react"
 
 import * as db from "@evals/db"
@@ -13,7 +13,6 @@ import {
 	DrawerHeader,
 	DrawerTitle,
 	ScrollArea,
-	Separator,
 	Table,
 	TableBody,
 	TableCell,
@@ -30,19 +29,6 @@ export function Run({ run }: { run: db.Run }) {
 	const scrollAreaRef = useRef<HTMLDivElement>(null)
 	const [selectedTask, setSelectedTask] = useState<db.Task>()
 
-	useEffect(() => {
-		if (selectedTask) {
-			const scrollArea = scrollAreaRef.current
-
-			if (scrollArea) {
-				scrollArea.scrollTo({
-					top: scrollArea.scrollHeight,
-					behavior: "smooth",
-				})
-			}
-		}
-	}, [selectedTask, outputCounts])
-
 	return (
 		<>
 			<div>
@@ -51,7 +37,7 @@ export function Run({ run }: { run: db.Run }) {
 						<div>{run.model}</div>
 						{run.description && <div className="text-sm text-muted-foreground">{run.description}</div>}
 					</div>
-					<ConnectionStatus status={status} pid={run.pid} />
+					{!run.taskMetricsId && <ConnectionStatus status={status} pid={run.pid} />}
 				</div>
 				{!tasks ? (
 					<LoaderCircle className="size-4 animate-spin" />
diff --git a/evals/apps/web/src/app/runs/new/new-run.tsx b/evals/apps/web/src/app/runs/new/new-run.tsx
index 739f5e18245..82dea2ffb4a 100644
--- a/evals/apps/web/src/app/runs/new/new-run.tsx
+++ b/evals/apps/web/src/app/runs/new/new-run.tsx
@@ -82,15 +82,29 @@ export function NewRun() {
 	const [model, suite, settings] = watch(["model", "suite", "settings"])
 
 	const onSubmit = useCallback(
-		async (data: FormValues) => {
+		async ({ settings, ...data }: FormValues) => {
 			try {
-				const { id } = await createRun(data)
+				const openRouterModel = models.data?.find(({ id }) => id === data.model)
+
+				if (!openRouterModel) {
+					throw new Error(`Model not found: ${data.model}`)
+				}
+
+				const { id } = await createRun({
+					...data,
+					settings: {
+						...settings,
+						openRouterModelId: openRouterModel.id,
+						openRouterModelInfo: openRouterModel.modelInfo,
+					},
+				})
+
 				router.push(`/runs/${id}`)
 			} catch (e) {
 				toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
 			}
 		},
-		[router],
+		[router, models.data],
 	)
 
 	const onFilterModels = useCallback(
diff --git a/evals/apps/web/src/hooks/use-open-router-models.ts b/evals/apps/web/src/hooks/use-open-router-models.ts
index 42b90bff887..fe4e2638a35 100644
--- a/evals/apps/web/src/hooks/use-open-router-models.ts
+++ b/evals/apps/web/src/hooks/use-open-router-models.ts
@@ -1,17 +1,41 @@
 import { z } from "zod"
 import { useQuery } from "@tanstack/react-query"
 
+import { type ModelInfo } from "@evals/types"
+
+const supportsPromptCache = ["anthropic/claude-3.7-sonnet", "anthropic/claude-3.5-sonnet", "anthropic/claude-3-5-haiku"]
+
+const supportsComputerUse = ["anthropic/claude-3.7-sonnet", "anthropic/claude-3.5-sonnet"]
+
+const supportsThinking = ["anthropic/claude-3.7-sonnet:thinking"]
+
+const parsePrice = (price?: string) => (price ? parseFloat(price) * 1_000_000 : undefined)
+
 export const openRouterModelSchema = z.object({
 	id: z.string(),
 	name: z.string(),
 	description: z.string(),
 	created: z.number(),
 	context_length: z.number(),
+	pricing: z.object({
+		prompt: z.string().optional(),
+		completion: z.string().optional(),
+	}),
+	top_provider: z
+		.object({
+			max_completion_tokens: z.number().nullish(),
+		})
+		.optional(),
+	architecture: z
+		.object({
+			modality: z.string(),
+		})
+		.optional(),
 })
 
-export type OpenRouterModel = z.infer<typeof openRouterModelSchema>
+export type OpenRouterModel = z.infer<typeof openRouterModelSchema> & { modelInfo: ModelInfo }
 
-export const getOpenRouterModels = async () => {
+export const getOpenRouterModels = async (): Promise<OpenRouterModel[]> => {
 	const response = await fetch("https://openrouter.ai/api/v1/models")
 
 	if (!response.ok) {
@@ -26,7 +50,22 @@ export const getOpenRouterModels = async () => {
 		return []
 	}
 
-	return result.data.data.sort((a, b) => a.name.localeCompare(b.name))
+	return result.data.data
+		.sort((a, b) => a.name.localeCompare(b.name))
+		.map((rawModel) => ({
+			...rawModel,
+			modelInfo: {
+				maxTokens: rawModel.top_provider?.max_completion_tokens ?? undefined,
+				contextWindow: rawModel.context_length,
+				supportsImages: rawModel.architecture?.modality?.includes("image"),
+				supportsPromptCache: supportsPromptCache.some((model) => rawModel.id.startsWith(model)),
+				supportsComputerUse: supportsComputerUse.some((model) => rawModel.id.startsWith(model)),
+				inputPrice: parsePrice(rawModel.pricing?.prompt),
+				outputPrice: parsePrice(rawModel.pricing?.completion),
+				description: rawModel.description,
+				thinking: supportsThinking.some((model) => rawModel.id.startsWith(model)),
+			},
+		}))
 }
 
 export const useOpenRouterModels = () =>
diff --git a/evals/apps/web/src/lib/format-tokens.ts b/evals/apps/web/src/lib/format-tokens.ts
index d017c9ce6a1..c51009478af 100644
--- a/evals/apps/web/src/lib/format-tokens.ts
+++ b/evals/apps/web/src/lib/format-tokens.ts
@@ -3,5 +3,13 @@ export const formatTokens = (tokens: number) => {
 		return tokens.toString()
 	}
 
-	return `${(tokens / 1000).toFixed(1)}k`
+	if (tokens < 1000000) {
+		return `${(tokens / 1000).toFixed(1)}k`
+	}
+
+	if (tokens < 1000000000) {
+		return `${(tokens / 1000000).toFixed(1)}M`
+	}
+
+	return `${(tokens / 1000000000).toFixed(1)}B`
 }
diff --git a/evals/apps/web/src/lib/schemas.ts b/evals/apps/web/src/lib/schemas.ts
index 6ceeb26984e..4869ef9186d 100644
--- a/evals/apps/web/src/lib/schemas.ts
+++ b/evals/apps/web/src/lib/schemas.ts
@@ -1,6 +1,6 @@
 import { z } from "zod"
 
-import { globalSettingsSchema } from "@evals/types"
+import { rooCodeSettingsSchema } from "@evals/types"
 
 /**
  * CreateRun
@@ -12,7 +12,7 @@ export const createRunSchema = z
 		description: z.string().optional(),
 		suite: z.enum(["full", "partial"]),
 		exercises: z.array(z.string()).optional(),
-		settings: globalSettingsSchema.optional(),
+		settings: rooCodeSettingsSchema.optional(),
 	})
 	.refine((data) => data.suite === "full" || (data.exercises || []).length > 0, {
 		message: "Exercises are required when running a partial suite.",
diff --git a/evals/packages/db/package.json b/evals/packages/db/package.json
index f3621a8032b..c140ffa0480 100644
--- a/evals/packages/db/package.json
+++ b/evals/packages/db/package.json
@@ -14,7 +14,8 @@
 		"db:pull": "pnpm drizzle-kit pull",
 		"db:check": "pnpm drizzle-kit check",
 		"db:up": "pnpm drizzle-kit up",
-		"db:studio": "pnpm drizzle-kit studio"
+		"db:studio": "pnpm drizzle-kit studio",
+		"db:enable-wal": "dotenvx run -f ../../.env -- tsx scripts/enable-wal.mts"
 	},
 	"dependencies": {
 		"@evals/types": "workspace:^",
diff --git a/evals/packages/db/scripts/enable-wal.mts b/evals/packages/db/scripts/enable-wal.mts
new file mode 100644
index 00000000000..a443c0d99ea
--- /dev/null
+++ b/evals/packages/db/scripts/enable-wal.mts
@@ -0,0 +1,23 @@
+import { db } from "../src/db.js"
+
+const main = async () => {
+	// Enable WAL mode for better performance and concurrency.
+	// https://til.simonwillison.net/sqlite/enabling-wal-mode
+	try {
+		const { rows } = await db.$client.execute("PRAGMA journal_mode = WAL;")
+		const row = rows[0]
+
+		if (row) {
+			console.log(`SQLite journal mode set to: ${row[0]}`)
+			process.exit(0)
+		} else {
+			console.error("Failed to enable WAL mode: no rows returned")
+			process.exit(1)
+		}
+	} catch (error) {
+		console.error(error)
+		process.exit(1)
+	}
+}
+
+main()
diff --git a/evals/packages/db/src/db.ts b/evals/packages/db/src/db.ts
index 042a2b95341..68c88539525 100644
--- a/evals/packages/db/src/db.ts
+++ b/evals/packages/db/src/db.ts
@@ -2,4 +2,9 @@ import { drizzle } from "drizzle-orm/libsql"
 
 import { schema } from "./schema.js"
 
-export const db = drizzle({ schema, connection: { url: process.env.BENCHMARKS_DB_PATH! } })
+const connection = {
+	url: process.env.BENCHMARKS_DB_PATH!,
+	concurrency: 50,
+}
+
+export const db = drizzle({ schema, connection })
diff --git a/evals/packages/db/src/schema.ts b/evals/packages/db/src/schema.ts
index 7c8ada7371d..02bc43e3cab 100644
--- a/evals/packages/db/src/schema.ts
+++ b/evals/packages/db/src/schema.ts
@@ -2,7 +2,7 @@ import { sqliteTable, text, real, integer, blob, uniqueIndex } from "drizzle-orm
 import { relations } from "drizzle-orm"
 import { createInsertSchema } from "drizzle-zod"
 
-import { GlobalSettings, exerciseLanguages, globalSettingsSchema } from "@evals/types"
+import { GlobalSettings, exerciseLanguages, rooCodeSettingsSchema } from "@evals/types"
 
 /**
  * runs
@@ -28,7 +28,7 @@ export const runsRelations = relations(runs, ({ one }) => ({
 export type Run = typeof runs.$inferSelect
 
 export const insertRunSchema = createInsertSchema(runs).omit({ id: true, createdAt: true }).extend({
-	settings: globalSettingsSchema.optional(),
+	settings: rooCodeSettingsSchema.optional(),
 })
 
 export type InsertRun = Omit<typeof runs.$inferInsert, "id" | "createdAt">
diff --git a/evals/packages/types/src/roo-code-defaults.ts b/evals/packages/types/src/roo-code-defaults.ts
index dc872ee13a8..940b9bfd87c 100644
--- a/evals/packages/types/src/roo-code-defaults.ts
+++ b/evals/packages/types/src/roo-code-defaults.ts
@@ -4,6 +4,22 @@ export const rooCodeDefaults: RooCodeSettings = {
 	apiProvider: "openrouter",
 	openRouterModelId: "google/gemini-2.0-flash-001", // "anthropic/claude-3.7-sonnet",
 
+	// apiProvider: "openai",
+	// openAiBaseUrl: "http://hrudolph.duckdns.org:4269/api/v1",
+	// openAiApiKey: process.env.OPENAI_API_KEY,
+	// openAiModelId: "models/gemini-2.5-pro-exp-03-25",
+	// openAiCustomModelInfo: {
+	// 	maxTokens: 65536,
+	// 	contextWindow: 1000000,
+	// 	supportsImages: true,
+	// 	supportsPromptCache: false,
+	// 	inputPrice: 0,
+	// 	outputPrice: 0,
+	// 	description:
+	// 		"Gemini 2.5 Pro is Google’s state-of-the-art AI model designed for advanced reasoning, coding, mathematics, and scientific tasks. It employs “thinking” capabilities, enabling it to reason through responses with enhanced accuracy and nuanced context handling. Gemini 2.5 Pro achieves top-tier performance on multiple benchmarks, including first-place positioning on the LMArena leaderboard, reflecting superior human-preference alignment and complex problem-solving abilities.",
+	// 	thinking: false,
+	// },
+
 	pinnedApiConfigs: {},
 	lastShownAnnouncementId: "mar-20-2025-3-10",
 
@@ -47,7 +63,6 @@ export const rooCodeDefaults: RooCodeSettings = {
 	diffEnabled: true,
 	fuzzyMatchThreshold: 1.0,
 	experiments: {
-		multi_search_and_replace: false,
 		search_and_replace: true,
 		insert_content: false,
 		powerSteering: false,
diff --git a/evals/packages/types/src/roo-code.ts b/evals/packages/types/src/roo-code.ts
index 2106d440940..6d582dbd555 100644
--- a/evals/packages/types/src/roo-code.ts
+++ b/evals/packages/types/src/roo-code.ts
@@ -270,12 +270,7 @@ export type CustomSupportPrompts = z.infer<typeof customSupportPromptsSchema>
  * ExperimentId
  */
 
-export const experimentIds = [
-	"search_and_replace",
-	"insert_content",
-	"powerSteering",
-	"multi_search_and_replace",
-] as const
+export const experimentIds = ["search_and_replace", "insert_content", "powerSteering"] as const
 
 export const experimentIdsSchema = z.enum(experimentIds)
 
@@ -289,7 +284,6 @@ const experimentsSchema = z.object({
 	search_and_replace: z.boolean(),
 	insert_content: z.boolean(),
 	powerSteering: z.boolean(),
-	multi_search_and_replace: z.boolean(),
 })
 
 export type Experiments = z.infer<typeof experimentsSchema>
diff --git a/evals/scripts/setup.sh b/evals/scripts/setup.sh
index 7d5e1bc3579..d36f4f8f4fc 100755
--- a/evals/scripts/setup.sh
+++ b/evals/scripts/setup.sh
@@ -296,6 +296,7 @@ fi
 if [[ ! -s /tmp/evals.db ]]; then
   echo "🗄️ Creating database..."
   pnpm --filter @evals/db db:push || exit 1
+  pnpm --filter @evals/db db:enable-wal || exit 1
 fi
 
 if ! grep -q "OPENROUTER_API_KEY" .env; then
diff --git a/src/core/Cline.ts b/src/core/Cline.ts
index c7e9fe66a55..b8f7f9ae9ef 100644
--- a/src/core/Cline.ts
+++ b/src/core/Cline.ts
@@ -105,6 +105,7 @@ export type ClineOptions = {
 	enableCheckpoints?: boolean
 	checkpointStorage?: CheckpointStorage
 	fuzzyMatchThreshold?: number
+	consecutiveMistakeLimit?: number
 	task?: string
 	images?: string[]
 	historyItem?: HistoryItem
@@ -135,7 +136,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 	customInstructions?: string
 	diffStrategy?: DiffStrategy
 	diffEnabled: boolean = false
-	fuzzyMatchThreshold: number = 1.0
+	fuzzyMatchThreshold: number
 
 	apiConversationHistory: (Anthropic.MessageParam & { ts?: number })[] = []
 	clineMessages: ClineMessage[] = []
@@ -144,10 +145,11 @@ export class Cline extends EventEmitter<ClineEvents> {
 	private askResponseText?: string
 	private askResponseImages?: string[]
 	private lastMessageTs?: number
-	// Not private since it needs to be accessible by tools
+	// Not private since it needs to be accessible by tools.
 	consecutiveMistakeCount: number = 0
+	consecutiveMistakeLimit: number
 	consecutiveMistakeCountForApplyDiff: Map<string, number> = new Map()
-	// Not private since it needs to be accessible by tools
+	// Not private since it needs to be accessible by tools.
 	providerRef: WeakRef<ClineProvider>
 	private abort: boolean = false
 	didFinishAbortingStream = false
@@ -178,10 +180,11 @@ export class Cline extends EventEmitter<ClineEvents> {
 		provider,
 		apiConfiguration,
 		customInstructions,
-		enableDiff,
+		enableDiff = false,
 		enableCheckpoints = true,
 		checkpointStorage = "task",
-		fuzzyMatchThreshold,
+		fuzzyMatchThreshold = 1.0,
+		consecutiveMistakeLimit = 3,
 		task,
 		images,
 		historyItem,
@@ -189,7 +192,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 		startTask = true,
 		rootTask,
 		parentTask,
-		taskNumber,
+		taskNumber = -1,
 		onCreated,
 	}: ClineOptions) {
 		super()
@@ -211,8 +214,9 @@ export class Cline extends EventEmitter<ClineEvents> {
 		this.urlContentFetcher = new UrlContentFetcher(provider.context)
 		this.browserSession = new BrowserSession(provider.context)
 		this.customInstructions = customInstructions
-		this.diffEnabled = enableDiff ?? false
-		this.fuzzyMatchThreshold = fuzzyMatchThreshold ?? 1.0
+		this.diffEnabled = enableDiff
+		this.fuzzyMatchThreshold = fuzzyMatchThreshold
+		this.consecutiveMistakeLimit = consecutiveMistakeLimit
 		this.providerRef = new WeakRef(provider)
 		this.diffViewProvider = new DiffViewProvider(this.cwd)
 		this.enableCheckpoints = enableCheckpoints
@@ -220,7 +224,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 
 		this.rootTask = rootTask
 		this.parentTask = parentTask
-		this.taskNumber = taskNumber ?? -1
+		this.taskNumber = taskNumber
 
 		if (historyItem) {
 			telemetryService.captureTaskRestarted(this.taskId)
@@ -1718,7 +1722,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 			throw new Error(`[Cline#recursivelyMakeClineRequests] task ${this.taskId}.${this.instanceId} aborted`)
 		}
 
-		if (this.consecutiveMistakeCount >= 3) {
+		if (this.consecutiveMistakeCount >= this.consecutiveMistakeLimit) {
 			const { response, text, images } = await this.ask(
 				"mistake_limit_reached",
 				this.api.getModel().id.includes("claude")
diff --git a/src/core/webview/ClineProvider.ts b/src/core/webview/ClineProvider.ts
index c31b77a5e1f..46df21ed103 100644
--- a/src/core/webview/ClineProvider.ts
+++ b/src/core/webview/ClineProvider.ts
@@ -447,10 +447,29 @@ export class ClineProvider extends EventEmitter<ClineProviderEvents> implements
 		return this.initClineWithTask(task, images, parent)
 	}
 
-	// when initializing a new task, (not from history but from a tool command new_task) there is no need to remove the previouse task
-	// since the new task is a sub task of the previous one, and when it finishes it is removed from the stack and the caller is resumed
-	// in this way we can have a chain of tasks, each one being a sub task of the previous one until the main task is finished
-	public async initClineWithTask(task?: string, images?: string[], parentTask?: Cline) {
+	// When initializing a new task (not from history, but from the `new_task`
+	// tool command) there is no need to remove the previous task: the new task
+	// is a subtask of the previous one, and when it finishes it is removed from
+	// the stack and the caller is resumed. In this way we can have a chain of
+	// tasks, each one being a subtask of the previous one, until the main task
+	// is finished.
+	public async initClineWithTask(
+		task?: string,
+		images?: string[],
+		parentTask?: Cline,
+		options: Partial<
+			Pick<
+				ClineOptions,
+				| "customInstructions"
+				| "enableDiff"
+				| "enableCheckpoints"
+				| "checkpointStorage"
+				| "fuzzyMatchThreshold"
+				| "consecutiveMistakeLimit"
+				| "experiments"
+			>
+		> = {},
+	) {
 		const {
 			apiConfiguration,
 			customModePrompts,
@@ -481,12 +500,15 @@ export class ClineProvider extends EventEmitter<ClineProviderEvents> implements
 			parentTask,
 			taskNumber: this.clineStack.length + 1,
 			onCreated: (cline) => this.emit("clineCreated", cline),
+			...options,
 		})
 
 		await this.addClineToStack(cline)
+
 		this.log(
 			`[subtasks] ${cline.parentTask ? "child" : "parent"} task ${cline.taskId}.${cline.instanceId} instantiated`,
 		)
+
 		return cline
 	}
 
diff --git a/src/exports/api.ts b/src/exports/api.ts
index b1061cb56ae..dc20e719a5e 100644
--- a/src/exports/api.ts
+++ b/src/exports/api.ts
@@ -1,11 +1,14 @@
 import { EventEmitter } from "events"
 import * as vscode from "vscode"
+import fs from "fs/promises"
+import * as path from "path"
 
+import { getWorkspacePath } from "../utils/path"
 import { ClineProvider } from "../core/webview/ClineProvider"
 import { openClineInNewTab } from "../activate/registerCommands"
-
 import { RooCodeSettings, RooCodeEvents, RooCodeEventName, ClineMessage } from "../schemas"
 import { IpcOrigin, IpcMessageType, TaskCommandName, TaskEvent } from "../schemas/ipc"
+
 import { RooCodeAPI } from "./interface"
 import { IpcServer } from "./ipc"
 import { outputChannelLog } from "./log"
@@ -18,6 +21,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 	private readonly ipc?: IpcServer
 	private readonly taskMap = new Map<string, ClineProvider>()
 	private readonly log: (...args: unknown[]) => void
+	private logfile?: string
 
 	constructor(
 		outputChannel: vscode.OutputChannel,
@@ -31,12 +35,16 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		this.sidebarProvider = provider
 		this.context = provider.context
 
-		this.log = enableLogging
-			? (...args: unknown[]) => {
-					outputChannelLog(this.outputChannel, ...args)
-					console.log(args)
-				}
-			: () => {}
+		if (enableLogging) {
+			this.log = (...args: unknown[]) => {
+				outputChannelLog(this.outputChannel, ...args)
+				console.log(args)
+			}
+
+			this.logfile = path.join(getWorkspacePath(), "roo-code-messages.log")
+		} else {
+			this.log = () => {}
+		}
 
 		this.registerListeners(this.sidebarProvider)
 
@@ -89,6 +97,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		let provider: ClineProvider
 
 		if (newTab) {
+			await vscode.commands.executeCommand("workbench.action.files.revert")
 			await vscode.commands.executeCommand("workbench.action.closeAllEditors")
 
 			if (!this.tabProvider) {
@@ -116,7 +125,10 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		await provider.postMessageToWebview({ type: "action", action: "chatButtonClicked" })
 		await provider.postMessageToWebview({ type: "invoke", invoke: "newChat", text, images })
 
-		const { taskId } = await provider.initClineWithTask(text, images)
+		const { taskId } = await provider.initClineWithTask(text, images, undefined, {
+			consecutiveMistakeLimit: Number.MAX_SAFE_INTEGER,
+		})
+
 		return taskId
 	}
 
@@ -163,8 +175,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		await this.sidebarProvider.postStateToWebview()
 	}
 
-	public async createProfile(name: string): Promise<string> {
-		// Input validation
+	public async createProfile(name: string) {
 		if (!name || !name.trim()) {
 			throw new Error("Profile name cannot be empty")
 		}
@@ -176,32 +187,33 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 			throw new Error(`A profile with the name "${name}" already exists`)
 		}
 
-		// Generate unique ID and create profile
 		const id = this.sidebarProvider.providerSettingsManager.generateId()
-		const newProfile = {
-			id,
-			name: name.trim(),
-			apiProvider: "openai" as const, // Type assertion for better type safety
-		}
 
-		// Update configuration with new profile
 		await this.setConfiguration({
 			...currentSettings,
-			listApiConfigMeta: [...profiles, newProfile],
+			listApiConfigMeta: [
+				...profiles,
+				{
+					id,
+					name: name.trim(),
+					apiProvider: "openai" as const,
+				},
+			],
 		})
+
 		return id
 	}
 
-	public getProfiles(): string[] {
-		const profiles = this.getConfiguration().listApiConfigMeta || []
-		return profiles.map((profile) => profile.name)
+	public getProfiles() {
+		return (this.getConfiguration().listApiConfigMeta || []).map((profile) => profile.name)
 	}
 
-	public async setActiveProfile(name: string): Promise<void> {
+	public async setActiveProfile(name: string) {
 		const currentSettings = this.getConfiguration()
 		const profiles = currentSettings.listApiConfigMeta || []
 
 		const profile = profiles.find((p) => p.name === name)
+
 		if (!profile) {
 			throw new Error(`Profile with name "${name}" does not exist`)
 		}
@@ -212,14 +224,15 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		})
 	}
 
-	public getActiveProfile(): string | undefined {
+	public getActiveProfile() {
 		return this.getConfiguration().currentApiConfigName
 	}
 
-	public async deleteProfile(name: string): Promise<void> {
+	public async deleteProfile(name: string) {
 		const currentSettings = this.getConfiguration()
 		const profiles = currentSettings.listApiConfigMeta || []
 		const targetIndex = profiles.findIndex((p) => p.name === name)
+
 		if (targetIndex === -1) {
 			throw new Error(`Profile with name "${name}" does not exist`)
 		}
@@ -227,7 +240,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 		const profileToDelete = profiles[targetIndex]
 		profiles.splice(targetIndex, 1)
 
-		// If we're deleting the active profile, clear the currentApiConfigName
+		// If we're deleting the active profile, clear the currentApiConfigName.
 		const newSettings: RooCodeSettings = {
 			...currentSettings,
 			listApiConfigMeta: profiles,
@@ -236,6 +249,7 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 					? undefined
 					: currentSettings.currentApiConfigName,
 		}
+
 		await this.setConfiguration(newSettings)
 	}
 
@@ -245,12 +259,19 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 
 	private registerListeners(provider: ClineProvider) {
 		provider.on("clineCreated", (cline) => {
-			cline.on("taskStarted", () => {
+			cline.on("taskStarted", async () => {
 				this.emit(RooCodeEventName.TaskStarted, cline.taskId)
 				this.taskMap.set(cline.taskId, provider)
+				await this.fileLog(`[${new Date().toISOString()}] taskStarted -> ${cline.taskId}\n`)
 			})
 
-			cline.on("message", (message) => this.emit(RooCodeEventName.Message, { taskId: cline.taskId, ...message }))
+			cline.on("message", async (message) => {
+				this.emit(RooCodeEventName.Message, { taskId: cline.taskId, ...message })
+
+				if (message.message.partial !== true) {
+					await this.fileLog(`[${new Date().toISOString()}] ${JSON.stringify(message.message, null, 2)}\n`)
+				}
+			})
 
 			cline.on("taskModeSwitched", (taskId, mode) => this.emit(RooCodeEventName.TaskModeSwitched, taskId, mode))
 
@@ -265,9 +286,13 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 				this.taskMap.delete(cline.taskId)
 			})
 
-			cline.on("taskCompleted", (_, usage) => {
+			cline.on("taskCompleted", async (_, usage) => {
 				this.emit(RooCodeEventName.TaskCompleted, cline.taskId, usage)
 				this.taskMap.delete(cline.taskId)
+
+				await this.fileLog(
+					`[${new Date().toISOString()}] taskCompleted -> ${cline.taskId} | ${JSON.stringify(usage, null, 2)}\n`,
+				)
 			})
 
 			cline.on("taskSpawned", (childTaskId) => this.emit(RooCodeEventName.TaskSpawned, cline.taskId, childTaskId))
@@ -277,4 +302,16 @@ export class API extends EventEmitter<RooCodeEvents> implements RooCodeAPI {
 			this.emit(RooCodeEventName.TaskCreated, cline.taskId)
 		})
 	}
+
+	private async fileLog(message: string) {
+		if (!this.logfile) {
+			return
+		}
+
+		try {
+			await fs.appendFile(this.logfile, message, "utf8")
+		} catch (_) {
+			this.logfile = undefined
+		}
+	}
 }