diff --git a/evals/apps/cli/src/index.ts b/evals/apps/cli/src/index.ts index 62829a4af0d..e050edead27 100644 --- a/evals/apps/cli/src/index.ts +++ b/evals/apps/cli/src/index.ts @@ -70,7 +70,7 @@ const run = async (toolbox: GluegunToolbox) => { run = await createRun({ model: rooCodeDefaults.openRouterModelId!, pid: process.pid, - socketPath: path.resolve(os.tmpdir(), `roo-code-evals-${crypto.randomUUID()}.sock`), + socketPath: path.resolve(os.tmpdir(), `roo-code-evals-${crypto.randomUUID().slice(0, 8)}.sock`), }) if (language === "all") { @@ -101,7 +101,9 @@ const run = async (toolbox: GluegunToolbox) => { console.log(await execa({ cwd: exercisesPath })`git config user.email "support@roocode.com"`) console.log(await execa({ cwd: exercisesPath })`git checkout -f`) console.log(await execa({ cwd: exercisesPath })`git clean -fd`) - console.log(await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id} main`) + console.log( + await execa({ cwd: exercisesPath })`git checkout -b runs/${run.id}-${crypto.randomUUID().slice(0, 8)} main`, + ) fs.writeFileSync( path.resolve(exercisesPath, "settings.json"), diff --git a/evals/apps/web/src/app/runs/new/new-run.tsx b/evals/apps/web/src/app/runs/new/new-run.tsx index fdfc85aca70..8c7266843df 100644 --- a/evals/apps/web/src/app/runs/new/new-run.tsx +++ b/evals/apps/web/src/app/runs/new/new-run.tsx @@ -22,7 +22,6 @@ import { FormField, FormItem, FormLabel, - FormDescription, FormMessage, Textarea, Tabs, @@ -43,15 +42,11 @@ import { import { SettingsDiff } from "./settings-diff" -const recommendedModels = [ - "anthropic/claude-3.7-sonnet", - "anthropic/claude-3.7-sonnet:thinking", - "google/gemini-2.0-flash-001", -] - export function NewRun() { const router = useRouter() + const [mode, setMode] = useState<"openrouter" | "settings">("openrouter") + const [modelSearchValue, setModelSearchValue] = useState("") const [modelPopoverOpen, setModelPopoverOpen] = useState(false) const modelSearchResultsRef = useRef>(new Map()) @@ -81,29 +76,15 @@ export function NewRun() { const [model, suite, settings] = watch(["model", "suite", "settings"]) const onSubmit = useCallback( - async ({ settings, ...data }: FormValues) => { + async (values: FormValues) => { try { - const openRouterModel = models.data?.find(({ id }) => id === data.model) - - if (!openRouterModel) { - throw new Error(`Model not found: ${data.model}`) - } - - const { id } = await createRun({ - ...data, - settings: { - ...settings, - openRouterModelId: openRouterModel.id, - openRouterModelInfo: openRouterModel.modelInfo, - }, - }) - + const { id } = await createRun(values) router.push(`/runs/${id}`) } catch (e) { toast.error(e instanceof Error ? e.message : "An unknown error occurred.") } }, - [router, models.data], + [router], ) const onFilterModels = useCallback( @@ -157,36 +138,25 @@ export function NewRun() { .parse(JSON.parse(await file.text())) const providerSettings = providerProfiles.apiConfigs[providerProfiles.currentApiConfigName] ?? {} - - if (providerSettings.apiProvider === "openrouter" && providerSettings.openRouterModelId) { - const { - openRouterModelId, - modelMaxTokens, - modelMaxThinkingTokens, - modelTemperature, - includeMaxTokens, - } = providerSettings - - const model = openRouterModelId - - const settings = { - ...rooCodeDefaults, - openRouterModelId, - modelMaxTokens, - modelMaxThinkingTokens, - modelTemperature, - includeMaxTokens, - ...globalSettings, - } - - setValue("model", model) - setValue("settings", settings) - } else { - setValue("settings", globalSettings) + const { apiProvider, openRouterModelId, openAiModelId } = providerSettings + + switch (apiProvider) { + case "openrouter": + setValue("model", openRouterModelId ?? "") + break + case "openai": + setValue("model", openAiModelId ?? "") + break + default: + throw new Error(`Unsupported API provider: ${apiProvider}`) } + setValue("settings", { ...rooCodeDefaults, ...providerSettings, ...globalSettings }) + setMode("settings") + event.target.value = "" } catch (e) { + console.error(e) toast.error(e instanceof Error ? e.message : "An unknown error occurred.") } }, @@ -199,108 +169,96 @@ export function NewRun() {
- ( - - OpenRouter Model - - - - - - - - - No model found. - - {models.data?.map(({ id, name }) => ( - - {name} - - - ))} - - - - - - - - Recommended: - {recommendedModels.map((modelId) => ( - - ))} - - +
+ {mode === "openrouter" && ( + ( + + + + + + + + + + No model found. + + {models.data?.map(({ id, name }) => ( + + {name} + + + ))} + + + + + + + + )} + /> )} - /> - - Import Settings - - - {settings ? ( - - <> -
- -
- Imported valid Roo Code settings. Showing differences from default settings. + + + + {settings && ( + + <> +
+ +
+ Imported valid Roo Code settings. Showing differences from default + settings. +
-
- - - - ) : ( - - Fully configure how Roo Code for this run using a settings file that was exported by Roo - Code. - - )} - - + + + + )} + + +
{ + return db + .select({ + runId: tasks.runId, + avgDuration: avg(taskMetrics.duration).mapWith(Number), + minDuration: min(taskMetrics.duration).mapWith(Number), + maxDuration: max(taskMetrics.duration).mapWith(Number), + }) + .from(tasks) + .innerJoin(taskMetrics, eq(tasks.taskMetricsId, taskMetrics.id)) + .innerJoin(runs, eq(tasks.runId, runs.id)) + .where(and(eq(tasks.passed, true), isNotNull(runs.taskMetricsId))) + .groupBy(tasks.runId) +}