Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions evals/apps/cli/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ const run = async (toolbox: GluegunToolbox) => {
run = await createRun({
model: rooCodeDefaults.openRouterModelId!,
pid: process.pid,
socketPath: path.resolve(os.tmpdir(), `roo-code-evals-${crypto.randomUUID()}.sock`),
socketPath: path.resolve(os.tmpdir(), `roo-code-evals-${crypto.randomUUID().slice(0, 8)}.sock`),
})

if (language === "all") {
Expand Down Expand Up @@ -101,7 +101,9 @@ const run = async (toolbox: GluegunToolbox) => {
console.log(await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`)
console.log(await execa({ cwd: exercisesPath })`git checkout -f`)
console.log(await execa({ cwd: exercisesPath })`git clean -fd`)
console.log(await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id} main`)
console.log(
await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`,
)

fs.writeFileSync(
path.resolve(exercisesPath, "settings.json"),
Expand Down
256 changes: 107 additions & 149 deletions evals/apps/web/src/app/runs/new/new-run.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import {
FormField,
FormItem,
FormLabel,
FormDescription,
FormMessage,
Textarea,
Tabs,
Expand All @@ -43,15 +42,11 @@ import {

import { SettingsDiff } from "./settings-diff"

const recommendedModels = [
"anthropic/claude-3.7-sonnet",
"anthropic/claude-3.7-sonnet:thinking",
"google/gemini-2.0-flash-001",
]

export function NewRun() {
const router = useRouter()

const [mode, setMode] = useState<"openrouter" | "settings">("openrouter")

const [modelSearchValue, setModelSearchValue] = useState("")
const [modelPopoverOpen, setModelPopoverOpen] = useState(false)
const modelSearchResultsRef = useRef<Map<string, number>>(new Map())
Expand Down Expand Up @@ -81,29 +76,15 @@ export function NewRun() {
const [model, suite, settings] = watch(["model", "suite", "settings"])

const onSubmit = useCallback(
async ({ settings, ...data }: FormValues) => {
async (values: FormValues) => {
try {
const openRouterModel = models.data?.find(({ id }) => id === data.model)

if (!openRouterModel) {
throw new Error(`Model not found: ${data.model}`)
}

const { id } = await createRun({
...data,
settings: {
...settings,
openRouterModelId: openRouterModel.id,
openRouterModelInfo: openRouterModel.modelInfo,
},
})

const { id } = await createRun(values)
router.push(`/runs/${id}`)
} catch (e) {
toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
}
},
[router, models.data],
[router],
)

const onFilterModels = useCallback(
Expand Down Expand Up @@ -157,36 +138,25 @@ export function NewRun() {
.parse(JSON.parse(await file.text()))

const providerSettings = providerProfiles.apiConfigs[providerProfiles.currentApiConfigName] ?? {}

if (providerSettings.apiProvider === "openrouter" && providerSettings.openRouterModelId) {
const {
openRouterModelId,
modelMaxTokens,
modelMaxThinkingTokens,
modelTemperature,
includeMaxTokens,
} = providerSettings

const model = openRouterModelId

const settings = {
...rooCodeDefaults,
openRouterModelId,
modelMaxTokens,
modelMaxThinkingTokens,
modelTemperature,
includeMaxTokens,
...globalSettings,
}

setValue("model", model)
setValue("settings", settings)
} else {
setValue("settings", globalSettings)
const { apiProvider, openRouterModelId, openAiModelId } = providerSettings

switch (apiProvider) {
case "openrouter":
setValue("model", openRouterModelId ?? "")
break
case "openai":
setValue("model", openAiModelId ?? "")
break
default:
throw new Error(`Unsupported API provider: ${apiProvider}`)
}

setValue("settings", { ...rooCodeDefaults, ...providerSettings, ...globalSettings })
setMode("settings")

event.target.value = ""
} catch (e) {
console.error(e)
toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
}
},
Expand All @@ -199,108 +169,96 @@ export function NewRun() {
<form
onSubmit={form.handleSubmit(onSubmit)}
className="flex flex-col justify-center divide-y divide-primary *:py-5">
<FormField
control={form.control}
name="model"
render={() => (
<FormItem>
<FormLabel>OpenRouter Model</FormLabel>
<Popover open={modelPopoverOpen} onOpenChange={setModelPopoverOpen}>
<PopoverTrigger asChild>
<Button
variant="input"
role="combobox"
aria-expanded={modelPopoverOpen}
className="flex items-center justify-between">
<div>
{models.data?.find(({ id }) => id === model)?.name || model || "Select"}
</div>
<ChevronsUpDown className="opacity-50" />
</Button>
</PopoverTrigger>
<PopoverContent className="p-0 w-[var(--radix-popover-trigger-width)]">
<Command filter={onFilterModels}>
<CommandInput
placeholder="Search"
value={modelSearchValue}
onValueChange={setModelSearchValue}
className="h-9"
/>
<CommandList>
<CommandEmpty>No model found.</CommandEmpty>
<CommandGroup>
{models.data?.map(({ id, name }) => (
<CommandItem key={id} value={id} onSelect={onSelectModel}>
{name}
<Check
className={cn(
"ml-auto text-accent group-data-[selected=true]:text-accent-foreground size-4",
id === model ? "opacity-100" : "opacity-0",
)}
/>
</CommandItem>
))}
</CommandGroup>
</CommandList>
</Command>
</PopoverContent>
</Popover>
<FormMessage />
<FormDescription className="flex flex-wrap items-center gap-2">
<span>Recommended:</span>
{recommendedModels.map((modelId) => (
<Button
key={modelId}
variant="link"
className="break-all px-0!"
onClick={(e) => {
e.preventDefault()
setValue("model", modelId)
}}>
{modelId}
</Button>
))}
</FormDescription>
</FormItem>
<div className="flex flex-row justify-between gap-4">
{mode === "openrouter" && (
<FormField
control={form.control}
name="model"
render={() => (
<FormItem className="flex-1">
<Popover open={modelPopoverOpen} onOpenChange={setModelPopoverOpen}>
<PopoverTrigger asChild>
<Button
variant="input"
role="combobox"
aria-expanded={modelPopoverOpen}
className="flex items-center justify-between">
<div>
{models.data?.find(({ id }) => id === model)?.name ||
model ||
"Select OpenRouter Model"}
</div>
<ChevronsUpDown className="opacity-50" />
</Button>
</PopoverTrigger>
<PopoverContent className="p-0 w-[var(--radix-popover-trigger-width)]">
<Command filter={onFilterModels}>
<CommandInput
placeholder="Search"
value={modelSearchValue}
onValueChange={setModelSearchValue}
className="h-9"
/>
<CommandList>
<CommandEmpty>No model found.</CommandEmpty>
<CommandGroup>
{models.data?.map(({ id, name }) => (
<CommandItem
key={id}
value={id}
onSelect={onSelectModel}>
{name}
<Check
className={cn(
"ml-auto text-accent group-data-[selected=true]:text-accent-foreground size-4",
id === model ? "opacity-100" : "opacity-0",
)}
/>
</CommandItem>
))}
</CommandGroup>
</CommandList>
</Command>
</PopoverContent>
</Popover>
<FormMessage />
</FormItem>
)}
/>
)}
/>

<FormItem>
<FormLabel>Import Settings</FormLabel>
<Button
type="button"
variant="secondary"
size="icon"
onClick={() => document.getElementById("json-upload")?.click()}>
<HardDriveUpload />
</Button>
<input
id="json-upload"
type="file"
accept="application/json"
className="hidden"
onChange={onImportSettings}
/>
{settings ? (
<ScrollArea className="max-h-64 border rounded-sm">
<>
<div className="flex items-center gap-1 p-2 border-b">
<CircleCheck className="size-4 text-ring" />
<div className="text-sm">
Imported valid Roo Code settings. Showing differences from default settings.
<FormItem className="flex-1">
<Button
type="button"
variant="secondary"
onClick={() => document.getElementById("json-upload")?.click()}>
<HardDriveUpload />
Import Settings
</Button>
<input
id="json-upload"
type="file"
accept="application/json"
className="hidden"
onChange={onImportSettings}
/>
{settings && (
<ScrollArea className="max-h-64 border rounded-sm">
<>
<div className="flex items-center gap-1 p-2 border-b">
<CircleCheck className="size-4 text-ring" />
<div className="text-sm">
Imported valid Roo Code settings. Showing differences from default
settings.
</div>
</div>
</div>
<SettingsDiff defaultSettings={rooCodeDefaults} customSettings={settings} />
</>
</ScrollArea>
) : (
<FormDescription>
Fully configure how Roo Code for this run using a settings file that was exported by Roo
Code.
</FormDescription>
)}
<FormMessage />
</FormItem>
<SettingsDiff defaultSettings={rooCodeDefaults} customSettings={settings} />
</>
</ScrollArea>
)}
<FormMessage />
</FormItem>
</div>

<FormField
control={form.control}
Expand Down
16 changes: 12 additions & 4 deletions evals/packages/db/drizzle.config.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
import { defineConfig } from "drizzle-kit"

// Fail fast when no database target is configured: either both Turso variables
// (TURSO_CONNECTION_URL and TURSO_AUTH_TOKEN) or BENCHMARKS_DB_PATH must be set.
if ((!process.env.TURSO_CONNECTION_URL || !process.env.TURSO_AUTH_TOKEN) && !process.env.BENCHMARKS_DB_PATH) {
throw new Error("TURSO_CONNECTION_URL and TURSO_AUTH_TOKEN or BENCHMARKS_DB_PATH must be set")
}

// BENCHMARKS_DB_PATH takes precedence: when it is set the "sqlite" dialect is
// selected, otherwise "turso".
const dialect = process.env.BENCHMARKS_DB_PATH ? "sqlite" : "turso"

// Credentials follow the same precedence: the BENCHMARKS_DB_PATH value as `url`
// when it is set, otherwise the Turso URL plus auth token. The non-null
// assertions on the Turso variables rely on the guard above having passed.
const dbCredentials = process.env.BENCHMARKS_DB_PATH
? { url: process.env.BENCHMARKS_DB_PATH }
: { url: process.env.TURSO_CONNECTION_URL!, authToken: process.env.TURSO_AUTH_TOKEN! }

export default defineConfig({
out: "./drizzle",
schema: "./src/schema.ts",
dialect: "sqlite",
dbCredentials: {
url: process.env.BENCHMARKS_DB_PATH!,
},
dialect,
dbCredentials,
})
9 changes: 6 additions & 3 deletions evals/packages/db/src/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ import { drizzle } from "drizzle-orm/libsql"

import { schema } from "./schema.js"

const connection = {
url: process.env.BENCHMARKS_DB_PATH!,
concurrency: 50,
if ((!process.env.TURSO_CONNECTION_URL || !process.env.TURSO_AUTH_TOKEN) && !process.env.BENCHMARKS_DB_PATH) {
throw new Error("TURSO_CONNECTION_URL and TURSO_AUTH_TOKEN or BENCHMARKS_DB_PATH must be set")
}

const connection = process.env.BENCHMARKS_DB_PATH
? { url: process.env.BENCHMARKS_DB_PATH, concurrency: 50 }
: { url: process.env.TURSO_CONNECTION_URL!, authToken: process.env.TURSO_AUTH_TOKEN! }

export const db = drizzle({ schema, connection })
19 changes: 17 additions & 2 deletions evals/packages/db/src/queries/taskMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { eq } from "drizzle-orm"
import { eq, avg, min, max, and, isNotNull } from "drizzle-orm"

import { RecordNotFoundError, RecordNotCreatedError } from "./errors.js"
import type { InsertTaskMetrics, UpdateTaskMetrics } from "../schema.js"
import { insertTaskMetricsSchema, taskMetrics } from "../schema.js"
import { insertTaskMetricsSchema, taskMetrics, tasks, runs } from "../schema.js"
import { db } from "../db.js"

const table = taskMetrics
Expand Down Expand Up @@ -45,3 +45,18 @@ export const updateTaskMetrics = async (id: number, values: UpdateTaskMetrics) =

return record
}

/**
 * Per-run duration statistics for tasks that passed.
 *
 * Joins `tasks` to their `taskMetrics` row and to their `runs` row, keeps only
 * rows where the task passed and the run's `taskMetricsId` is not null, and
 * groups the result by run id.
 *
 * @returns One row per run id with `avgDuration`, `minDuration` and
 * `maxDuration` of `taskMetrics.duration`, each mapped through `Number`.
 */
export const successfulTaskDurations = async () => {
	// Projection: the grouping key plus the three duration aggregates.
	const durationStats = {
		runId: tasks.runId,
		avgDuration: avg(taskMetrics.duration).mapWith(Number),
		minDuration: min(taskMetrics.duration).mapWith(Number),
		maxDuration: max(taskMetrics.duration).mapWith(Number),
	}

	// Row filter: the task passed and its run has a task-metrics reference.
	const taskPassed = eq(tasks.passed, true)
	const runHasMetrics = isNotNull(runs.taskMetricsId)

	const rows = await db
		.select(durationStats)
		.from(tasks)
		.innerJoin(taskMetrics, eq(tasks.taskMetricsId, taskMetrics.id))
		.innerJoin(runs, eq(tasks.runId, runs.id))
		.where(and(taskPassed, runHasMetrics))
		.groupBy(tasks.runId)

	return rows
}
Loading