Skip to content

Commit 2eba534

Browse files
authored
Evals fixes (#2505)
* Allow Turso URLs, add support for API providers beyond OpenRouter
* Make the git branch name unique
1 parent 15b91ab commit 2eba534

File tree

5 files changed: +146 -160 lines changed

evals/apps/cli/src/index.ts

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ const run = async (toolbox: GluegunToolbox) => {
7070
run = await createRun({
7171
model: rooCodeDefaults.openRouterModelId!,
7272
pid: process.pid,
73-
socketPath: path.resolve(os.tmpdir(), `roo-code-evals-${crypto.randomUUID()}.sock`),
73+
socketPath: path.resolve(os.tmpdir(), `roo-code-evals-${crypto.randomUUID().slice(0, 8)}.sock`),
7474
})
7575

7676
if (language === "all") {
@@ -101,7 +101,9 @@ const run = async (toolbox: GluegunToolbox) => {
101101
console.log(await execa({ cwd: exercisesPath })`git config user.email "[email protected]"`)
102102
console.log(await execa({ cwd: exercisesPath })`git checkout -f`)
103103
console.log(await execa({ cwd: exercisesPath })`git clean -fd`)
104-
console.log(await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id} main`)
104+
console.log(
105+
await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`,
106+
)
105107

106108
fs.writeFileSync(
107109
path.resolve(exercisesPath, "settings.json"),

evals/apps/web/src/app/runs/new/new-run.tsx

Lines changed: 107 additions & 149 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@ import {
2222
FormField,
2323
FormItem,
2424
FormLabel,
25-
FormDescription,
2625
FormMessage,
2726
Textarea,
2827
Tabs,
@@ -43,15 +42,11 @@ import {
4342

4443
import { SettingsDiff } from "./settings-diff"
4544

46-
const recommendedModels = [
47-
"anthropic/claude-3.7-sonnet",
48-
"anthropic/claude-3.7-sonnet:thinking",
49-
"google/gemini-2.0-flash-001",
50-
]
51-
5245
export function NewRun() {
5346
const router = useRouter()
5447

48+
const [mode, setMode] = useState<"openrouter" | "settings">("openrouter")
49+
5550
const [modelSearchValue, setModelSearchValue] = useState("")
5651
const [modelPopoverOpen, setModelPopoverOpen] = useState(false)
5752
const modelSearchResultsRef = useRef<Map<string, number>>(new Map())
@@ -81,29 +76,15 @@ export function NewRun() {
8176
const [model, suite, settings] = watch(["model", "suite", "settings"])
8277

8378
const onSubmit = useCallback(
84-
async ({ settings, ...data }: FormValues) => {
79+
async (values: FormValues) => {
8580
try {
86-
const openRouterModel = models.data?.find(({ id }) => id === data.model)
87-
88-
if (!openRouterModel) {
89-
throw new Error(`Model not found: ${data.model}`)
90-
}
91-
92-
const { id } = await createRun({
93-
...data,
94-
settings: {
95-
...settings,
96-
openRouterModelId: openRouterModel.id,
97-
openRouterModelInfo: openRouterModel.modelInfo,
98-
},
99-
})
100-
81+
const { id } = await createRun(values)
10182
router.push(`/runs/${id}`)
10283
} catch (e) {
10384
toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
10485
}
10586
},
106-
[router, models.data],
87+
[router],
10788
)
10889

10990
const onFilterModels = useCallback(
@@ -157,36 +138,25 @@ export function NewRun() {
157138
.parse(JSON.parse(await file.text()))
158139

159140
const providerSettings = providerProfiles.apiConfigs[providerProfiles.currentApiConfigName] ?? {}
160-
161-
if (providerSettings.apiProvider === "openrouter" && providerSettings.openRouterModelId) {
162-
const {
163-
openRouterModelId,
164-
modelMaxTokens,
165-
modelMaxThinkingTokens,
166-
modelTemperature,
167-
includeMaxTokens,
168-
} = providerSettings
169-
170-
const model = openRouterModelId
171-
172-
const settings = {
173-
...rooCodeDefaults,
174-
openRouterModelId,
175-
modelMaxTokens,
176-
modelMaxThinkingTokens,
177-
modelTemperature,
178-
includeMaxTokens,
179-
...globalSettings,
180-
}
181-
182-
setValue("model", model)
183-
setValue("settings", settings)
184-
} else {
185-
setValue("settings", globalSettings)
141+
const { apiProvider, openRouterModelId, openAiModelId } = providerSettings
142+
143+
switch (apiProvider) {
144+
case "openrouter":
145+
setValue("model", openRouterModelId ?? "")
146+
break
147+
case "openai":
148+
setValue("model", openAiModelId ?? "")
149+
break
150+
default:
151+
throw new Error(`Unsupported API provider: ${apiProvider}`)
186152
}
187153

154+
setValue("settings", { ...rooCodeDefaults, ...providerSettings, ...globalSettings })
155+
setMode("settings")
156+
188157
event.target.value = ""
189158
} catch (e) {
159+
console.error(e)
190160
toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
191161
}
192162
},
@@ -199,108 +169,96 @@ export function NewRun() {
199169
<form
200170
onSubmit={form.handleSubmit(onSubmit)}
201171
className="flex flex-col justify-center divide-y divide-primary *:py-5">
202-
<FormField
203-
control={form.control}
204-
name="model"
205-
render={() => (
206-
<FormItem>
207-
<FormLabel>OpenRouter Model</FormLabel>
208-
<Popover open={modelPopoverOpen} onOpenChange={setModelPopoverOpen}>
209-
<PopoverTrigger asChild>
210-
<Button
211-
variant="input"
212-
role="combobox"
213-
aria-expanded={modelPopoverOpen}
214-
className="flex items-center justify-between">
215-
<div>
216-
{models.data?.find(({ id }) => id === model)?.name || model || "Select"}
217-
</div>
218-
<ChevronsUpDown className="opacity-50" />
219-
</Button>
220-
</PopoverTrigger>
221-
<PopoverContent className="p-0 w-[var(--radix-popover-trigger-width)]">
222-
<Command filter={onFilterModels}>
223-
<CommandInput
224-
placeholder="Search"
225-
value={modelSearchValue}
226-
onValueChange={setModelSearchValue}
227-
className="h-9"
228-
/>
229-
<CommandList>
230-
<CommandEmpty>No model found.</CommandEmpty>
231-
<CommandGroup>
232-
{models.data?.map(({ id, name }) => (
233-
<CommandItem key={id} value={id} onSelect={onSelectModel}>
234-
{name}
235-
<Check
236-
className={cn(
237-
"ml-auto text-accent group-data-[selected=true]:text-accent-foreground size-4",
238-
id === model ? "opacity-100" : "opacity-0",
239-
)}
240-
/>
241-
</CommandItem>
242-
))}
243-
</CommandGroup>
244-
</CommandList>
245-
</Command>
246-
</PopoverContent>
247-
</Popover>
248-
<FormMessage />
249-
<FormDescription className="flex flex-wrap items-center gap-2">
250-
<span>Recommended:</span>
251-
{recommendedModels.map((modelId) => (
252-
<Button
253-
key={modelId}
254-
variant="link"
255-
className="break-all px-0!"
256-
onClick={(e) => {
257-
e.preventDefault()
258-
setValue("model", modelId)
259-
}}>
260-
{modelId}
261-
</Button>
262-
))}
263-
</FormDescription>
264-
</FormItem>
172+
<div className="flex flex-row justify-between gap-4">
173+
{mode === "openrouter" && (
174+
<FormField
175+
control={form.control}
176+
name="model"
177+
render={() => (
178+
<FormItem className="flex-1">
179+
<Popover open={modelPopoverOpen} onOpenChange={setModelPopoverOpen}>
180+
<PopoverTrigger asChild>
181+
<Button
182+
variant="input"
183+
role="combobox"
184+
aria-expanded={modelPopoverOpen}
185+
className="flex items-center justify-between">
186+
<div>
187+
{models.data?.find(({ id }) => id === model)?.name ||
188+
model ||
189+
"Select OpenRouter Model"}
190+
</div>
191+
<ChevronsUpDown className="opacity-50" />
192+
</Button>
193+
</PopoverTrigger>
194+
<PopoverContent className="p-0 w-[var(--radix-popover-trigger-width)]">
195+
<Command filter={onFilterModels}>
196+
<CommandInput
197+
placeholder="Search"
198+
value={modelSearchValue}
199+
onValueChange={setModelSearchValue}
200+
className="h-9"
201+
/>
202+
<CommandList>
203+
<CommandEmpty>No model found.</CommandEmpty>
204+
<CommandGroup>
205+
{models.data?.map(({ id, name }) => (
206+
<CommandItem
207+
key={id}
208+
value={id}
209+
onSelect={onSelectModel}>
210+
{name}
211+
<Check
212+
className={cn(
213+
"ml-auto text-accent group-data-[selected=true]:text-accent-foreground size-4",
214+
id === model ? "opacity-100" : "opacity-0",
215+
)}
216+
/>
217+
</CommandItem>
218+
))}
219+
</CommandGroup>
220+
</CommandList>
221+
</Command>
222+
</PopoverContent>
223+
</Popover>
224+
<FormMessage />
225+
</FormItem>
226+
)}
227+
/>
265228
)}
266-
/>
267229

268-
<FormItem>
269-
<FormLabel>Import Settings</FormLabel>
270-
<Button
271-
type="button"
272-
variant="secondary"
273-
size="icon"
274-
onClick={() => document.getElementById("json-upload")?.click()}>
275-
<HardDriveUpload />
276-
</Button>
277-
<input
278-
id="json-upload"
279-
type="file"
280-
accept="application/json"
281-
className="hidden"
282-
onChange={onImportSettings}
283-
/>
284-
{settings ? (
285-
<ScrollArea className="max-h-64 border rounded-sm">
286-
<>
287-
<div className="flex items-center gap-1 p-2 border-b">
288-
<CircleCheck className="size-4 text-ring" />
289-
<div className="text-sm">
290-
Imported valid Roo Code settings. Showing differences from default settings.
230+
<FormItem className="flex-1">
231+
<Button
232+
type="button"
233+
variant="secondary"
234+
onClick={() => document.getElementById("json-upload")?.click()}>
235+
<HardDriveUpload />
236+
Import Settings
237+
</Button>
238+
<input
239+
id="json-upload"
240+
type="file"
241+
accept="application/json"
242+
className="hidden"
243+
onChange={onImportSettings}
244+
/>
245+
{settings && (
246+
<ScrollArea className="max-h-64 border rounded-sm">
247+
<>
248+
<div className="flex items-center gap-1 p-2 border-b">
249+
<CircleCheck className="size-4 text-ring" />
250+
<div className="text-sm">
251+
Imported valid Roo Code settings. Showing differences from default
252+
settings.
253+
</div>
291254
</div>
292-
</div>
293-
<SettingsDiff defaultSettings={rooCodeDefaults} customSettings={settings} />
294-
</>
295-
</ScrollArea>
296-
) : (
297-
<FormDescription>
298-
Fully configure how Roo Code for this run using a settings file that was exported by Roo
299-
Code.
300-
</FormDescription>
301-
)}
302-
<FormMessage />
303-
</FormItem>
255+
<SettingsDiff defaultSettings={rooCodeDefaults} customSettings={settings} />
256+
</>
257+
</ScrollArea>
258+
)}
259+
<FormMessage />
260+
</FormItem>
261+
</div>
304262

305263
<FormField
306264
control={form.control}

evals/packages/db/drizzle.config.ts

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,18 @@
11
import { defineConfig } from "drizzle-kit"
22

3+
if ((!process.env.TURSO_CONNECTION_URL || !process.env.TURSO_AUTH_TOKEN) && !process.env.BENCHMARKS_DB_PATH) {
4+
throw new Error("TURSO_CONNECTION_URL and TURSO_AUTH_TOKEN or BENCHMARKS_DB_PATH must be set")
5+
}
6+
7+
const dialect = process.env.BENCHMARKS_DB_PATH ? "sqlite" : "turso"
8+
9+
const dbCredentials = process.env.BENCHMARKS_DB_PATH
10+
? { url: process.env.BENCHMARKS_DB_PATH }
11+
: { url: process.env.TURSO_CONNECTION_URL!, authToken: process.env.TURSO_AUTH_TOKEN! }
12+
313
export default defineConfig({
414
out: "./drizzle",
515
schema: "./src/schema.ts",
6-
dialect: "sqlite",
7-
dbCredentials: {
8-
url: process.env.BENCHMARKS_DB_PATH!,
9-
},
16+
dialect,
17+
dbCredentials,
1018
})

evals/packages/db/src/db.ts

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,12 @@ import { drizzle } from "drizzle-orm/libsql"
22

33
import { schema } from "./schema.js"
44

5-
const connection = {
6-
url: process.env.BENCHMARKS_DB_PATH!,
7-
concurrency: 50,
5+
if ((!process.env.TURSO_CONNECTION_URL || !process.env.TURSO_AUTH_TOKEN) && !process.env.BENCHMARKS_DB_PATH) {
6+
throw new Error("TURSO_CONNECTION_URL and TURSO_AUTH_TOKEN or BENCHMARKS_DB_PATH must be set")
87
}
98

9+
const connection = process.env.BENCHMARKS_DB_PATH
10+
? { url: process.env.BENCHMARKS_DB_PATH, concurrency: 50 }
11+
: { url: process.env.TURSO_CONNECTION_URL!, authToken: process.env.TURSO_AUTH_TOKEN! }
12+
1013
export const db = drizzle({ schema, connection })

evals/packages/db/src/queries/taskMetrics.ts

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
1-
import { eq } from "drizzle-orm"
1+
import { eq, avg, min, max, and, isNotNull } from "drizzle-orm"
22

33
import { RecordNotFoundError, RecordNotCreatedError } from "./errors.js"
44
import type { InsertTaskMetrics, UpdateTaskMetrics } from "../schema.js"
5-
import { insertTaskMetricsSchema, taskMetrics } from "../schema.js"
5+
import { insertTaskMetricsSchema, taskMetrics, tasks, runs } from "../schema.js"
66
import { db } from "../db.js"
77

88
const table = taskMetrics
@@ -45,3 +45,18 @@ export const updateTaskMetrics = async (id: number, values: UpdateTaskMetrics) =
4545

4646
return record
4747
}
48+
49+
export const successfulTaskDurations = async () => {
50+
return db
51+
.select({
52+
runId: tasks.runId,
53+
avgDuration: avg(taskMetrics.duration).mapWith(Number),
54+
minDuration: min(taskMetrics.duration).mapWith(Number),
55+
maxDuration: max(taskMetrics.duration).mapWith(Number),
56+
})
57+
.from(tasks)
58+
.innerJoin(taskMetrics, eq(tasks.taskMetricsId, taskMetrics.id))
59+
.innerJoin(runs, eq(tasks.runId, runs.id))
60+
.where(and(eq(tasks.passed, true), isNotNull(runs.taskMetricsId)))
61+
.groupBy(tasks.runId)
62+
}

0 commit comments

Comments (0)