diff --git a/apps/web-evals/src/actions/runs.ts b/apps/web-evals/src/actions/runs.ts index 2eae1f6804..b89d404ab7 100644 --- a/apps/web-evals/src/actions/runs.ts +++ b/apps/web-evals/src/actions/runs.ts @@ -21,11 +21,21 @@ import { CreateRun } from "@/lib/schemas" const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals") -// eslint-disable-next-line @typescript-eslint/no-unused-vars -export async function createRun({ suite, exercises = [], systemPrompt, timeout, ...values }: CreateRun) { +export async function createRun({ + suite, + exercises = [], + timeout, + contextWindow, + pricePerMillionInputTokens, + pricePerMillionOutputTokens, + ...values +}: CreateRun & { contextWindow?: number; pricePerMillionInputTokens?: number; pricePerMillionOutputTokens?: number }) { const run = await _createRun({ ...values, timeout, + contextWindow, + pricePerMillionInputTokens, + pricePerMillionOutputTokens, socketPath: "", // TODO: Get rid of this. }) diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx index b6c5290b13..95792f16d4 100644 --- a/apps/web-evals/src/app/runs/[id]/run.tsx +++ b/apps/web-evals/src/app/runs/[id]/run.tsx @@ -47,6 +47,15 @@ export function Run({ run }: { run: Run }) {
{run.model}
+
+ {run.contextWindow && Context: {(run.contextWindow / 1000).toFixed(0)}k tokens} + {(run.pricePerMillionInputTokens || run.pricePerMillionOutputTokens) && ( + + Pricing: ${run.pricePerMillionInputTokens?.toFixed(2) || "?"} / $ + {run.pricePerMillionOutputTokens?.toFixed(2) || "?"} per 1M tokens + + )} +
{run.description &&
{run.description}
}
{!run.taskMetricsId && } diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx index 41d35f3c4c..7f2c39263b 100644 --- a/apps/web-evals/src/app/runs/new/new-run.tsx +++ b/apps/web-evals/src/app/runs/new/new-run.tsx @@ -26,7 +26,7 @@ import { TIMEOUT_DEFAULT, } from "@/lib/schemas" import { cn } from "@/lib/utils" -import { useOpenRouterModels } from "@/hooks/use-open-router-models" +import { useOpenRouterModels, getModelDetails, getPricingPerMillion } from "@/hooks/use-open-router-models" import { Button, FormControl, @@ -95,6 +95,21 @@ export function NewRun() { try { if (mode === "openrouter") { values.settings = { ...(values.settings || {}), openRouterModelId: model } + + // Get model details and add to the run + const modelDetails = getModelDetails(models.data, model) + if (modelDetails) { + const pricing = getPricingPerMillion(modelDetails.pricing) + const extendedValues = { + ...values, + contextWindow: modelDetails.context_length, + pricePerMillionInputTokens: pricing.input, + pricePerMillionOutputTokens: pricing.output, + } + const { id } = await createRun(extendedValues) + router.push(`/runs/${id}`) + return + } } const { id } = await createRun(values) @@ -103,7 +118,7 @@ export function NewRun() { toast.error(e instanceof Error ? e.message : "An unknown error occurred.") } }, - [mode, model, router], + [mode, model, models.data, router], ) const onFilterModels = useCallback( @@ -112,13 +127,12 @@ export function NewRun() { modelSearchValueRef.current = search modelSearchResultsRef.current.clear() - for (const { - obj: { id }, - score, - } of fuzzysort.go(search, models.data || [], { + const results = fuzzysort.go(search, models.data || [], { key: "name", - })) { - modelSearchResultsRef.current.set(id, score) + }) + + for (const result of results) { + modelSearchResultsRef.current.set(result.obj.id, result.score) } } @@ -210,16 +224,18 @@ export function NewRun() { No model found. - {models.data?.map(({ id, name }) => ( + {models.data?.map((modelItem) => ( - {name} + {modelItem.name} diff --git a/apps/web-evals/src/components/home/run.tsx b/apps/web-evals/src/components/home/run.tsx index c35673885c..e5bad98ffb 100644 --- a/apps/web-evals/src/components/home/run.tsx +++ b/apps/web-evals/src/components/home/run.tsx @@ -51,7 +51,16 @@ export function Run({ run, taskMetrics }: RunProps) { return ( <> - {run.model} + +
+
{run.model}
+ {run.contextWindow && ( +
+ {(run.contextWindow / 1000).toFixed(0)}k context +
+ )} +
+
{run.passed} {run.failed} @@ -76,7 +85,19 @@ export function Run({ run, taskMetrics }: RunProps) {
)} - {taskMetrics && formatCurrency(taskMetrics.cost)} + + {taskMetrics && ( +
+
{formatCurrency(taskMetrics.cost)}
+ {(run.pricePerMillionInputTokens || run.pricePerMillionOutputTokens) && ( +
+ ${run.pricePerMillionInputTokens?.toFixed(2) || "?"}/$ + {run.pricePerMillionOutputTokens?.toFixed(2) || "?"}/M +
+ )} +
+ )} +
{taskMetrics && formatDuration(taskMetrics.duration)} diff --git a/apps/web-evals/src/hooks/use-open-router-models.ts b/apps/web-evals/src/hooks/use-open-router-models.ts index 27800f90f2..b16183291d 100644 --- a/apps/web-evals/src/hooks/use-open-router-models.ts +++ b/apps/web-evals/src/hooks/use-open-router-models.ts @@ -1,9 +1,17 @@ import { z } from "zod" import { useQuery } from "@tanstack/react-query" +// Extended schema to include context window and pricing information export const openRouterModelSchema = z.object({ id: z.string(), name: z.string(), + context_length: z.number().optional(), + pricing: z + .object({ + prompt: z.union([z.string(), z.number()]).optional(), + completion: z.union([z.string(), z.number()]).optional(), + }) + .optional(), }) export type OpenRouterModel = z.infer @@ -29,4 +37,30 @@ export const useOpenRouterModels = () => useQuery({ queryKey: ["getOpenRouterModels"], queryFn: getOpenRouterModels, + staleTime: 1000 * 60 * 60, // Cache for 1 hour + gcTime: 1000 * 60 * 60 * 24, // Keep in cache for 24 hours (gcTime replaces cacheTime in v5) }) + +// Helper function to get model details by ID +export const getModelDetails = (models: OpenRouterModel[] | undefined, modelId: string) => { + if (!models) return null + return models.find((m) => m.id === modelId) +} + +// Helper function to convert pricing to per-million tokens +export const getPricingPerMillion = (pricing: OpenRouterModel["pricing"]) => { + if (!pricing) return { input: undefined, output: undefined } + + const parsePrice = (price: string | number | undefined): number | undefined => { + if (price === undefined) return undefined + const numPrice = typeof price === "string" ? parseFloat(price) : price + if (isNaN(numPrice)) return undefined + // OpenRouter prices are typically per token, convert to per million + return numPrice * 1_000_000 + } + + return { + input: parsePrice(pricing.prompt), + output: parsePrice(pricing.completion), + } +} diff --git a/packages/evals/src/db/migrations/0002_little_scarlet_witch.sql b/packages/evals/src/db/migrations/0002_little_scarlet_witch.sql new file mode 100644 index 0000000000..f8bf3d0158 --- /dev/null +++ b/packages/evals/src/db/migrations/0002_little_scarlet_witch.sql @@ -0,0 +1,3 @@ +ALTER TABLE "runs" ADD COLUMN "context_window" integer;--> statement-breakpoint +ALTER TABLE "runs" ADD COLUMN "price_per_million_input_tokens" real;--> statement-breakpoint +ALTER TABLE "runs" ADD COLUMN "price_per_million_output_tokens" real; \ No newline at end of file diff --git a/packages/evals/src/db/migrations/meta/0002_snapshot.json b/packages/evals/src/db/migrations/meta/0002_snapshot.json new file mode 100644 index 0000000000..1cd9762de0 --- /dev/null +++ b/packages/evals/src/db/migrations/meta/0002_snapshot.json @@ -0,0 +1,435 @@ +{ + "id": "4b1272d7-8100-4ceb-9cc8-dcf6eedace13", + "prevId": "43b197c4-ff4f-48c1-908b-a330e66a162d", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.runs": { + "name": "runs", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "identity": { + "type": "always", + "name": "runs_id_seq", + "schema": "public", + "increment": "1", + "startWith": "1", + "minValue": "1", + "maxValue": "2147483647", + "cache": "1", + "cycle": false + } + }, + "task_metrics_id": { + "name": "task_metrics_id", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "model": { + "name": "model", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "context_window": { + "name": "context_window", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "price_per_million_input_tokens": { + "name": "price_per_million_input_tokens", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "price_per_million_output_tokens": { + "name": "price_per_million_output_tokens", + "type": "real", + "primaryKey": false, + "notNull": false + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "settings": { + "name": "settings", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "pid": { + "name": "pid", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "socket_path": { + "name": "socket_path", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "concurrency": { + "name": "concurrency", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 2 + }, + "timeout": { + "name": "timeout", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 5 + }, + "passed": { + "name": "passed", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 0 + }, + "failed": { + "name": "failed", + "type": "integer", + "primaryKey": false, + "notNull": true, + "default": 0 + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": { + "runs_task_metrics_id_taskMetrics_id_fk": { + "name": "runs_task_metrics_id_taskMetrics_id_fk", + "tableFrom": "runs", + "tableTo": "taskMetrics", + "columnsFrom": ["task_metrics_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.taskMetrics": { + "name": "taskMetrics", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "identity": { + "type": "always", + "name": "taskMetrics_id_seq", + "schema": "public", + "increment": "1", + "startWith": "1", + "minValue": "1", + "maxValue": "2147483647", + "cache": "1", + "cycle": false + } + }, + "tokens_in": { + "name": "tokens_in", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "tokens_out": { + "name": "tokens_out", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "tokens_context": { + "name": "tokens_context", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "cache_writes": { + "name": "cache_writes", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "cache_reads": { + "name": "cache_reads", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "cost": { + "name": "cost", + "type": "real", + "primaryKey": false, + "notNull": true + }, + "duration": { + "name": "duration", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "tool_usage": { + "name": "tool_usage", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.tasks": { + "name": "tasks", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "identity": { + "type": "always", + "name": "tasks_id_seq", + "schema": "public", + "increment": "1", + "startWith": "1", + "minValue": "1", + "maxValue": "2147483647", + "cache": "1", + "cycle": false + } + }, + "run_id": { + "name": "run_id", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "task_metrics_id": { + "name": "task_metrics_id", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "language": { + "name": "language", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "exercise": { + "name": "exercise", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "passed": { + "name": "passed", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "finished_at": { + "name": "finished_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "tasks_language_exercise_idx": { + "name": "tasks_language_exercise_idx", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "language", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "exercise", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "tasks_run_id_runs_id_fk": { + "name": "tasks_run_id_runs_id_fk", + "tableFrom": "tasks", + "tableTo": "runs", + "columnsFrom": ["run_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + }, + "tasks_task_metrics_id_taskMetrics_id_fk": { + "name": "tasks_task_metrics_id_taskMetrics_id_fk", + "tableFrom": "tasks", + "tableTo": "taskMetrics", + "columnsFrom": ["task_metrics_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.toolErrors": { + "name": "toolErrors", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "identity": { + "type": "always", + "name": "toolErrors_id_seq", + "schema": "public", + "increment": "1", + "startWith": "1", + "minValue": "1", + "maxValue": "2147483647", + "cache": "1", + "cycle": false + } + }, + "run_id": { + "name": "run_id", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "task_id": { + "name": "task_id", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "tool_name": { + "name": "tool_name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": {}, + "foreignKeys": { + "toolErrors_run_id_runs_id_fk": { + "name": "toolErrors_run_id_runs_id_fk", + "tableFrom": "toolErrors", + "tableTo": "runs", + "columnsFrom": ["run_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + }, + "toolErrors_task_id_tasks_id_fk": { + "name": "toolErrors_task_id_tasks_id_fk", + "tableFrom": "toolErrors", + "tableTo": "tasks", + "columnsFrom": ["task_id"], + "columnsTo": ["id"], + "onDelete": "no action", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": {}, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/packages/evals/src/db/migrations/meta/_journal.json b/packages/evals/src/db/migrations/meta/_journal.json index e20425b105..e3eeaf5732 100644 --- a/packages/evals/src/db/migrations/meta/_journal.json +++ b/packages/evals/src/db/migrations/meta/_journal.json @@ -15,6 +15,13 @@ "when": 1753198630651, "tag": "0001_lowly_captain_flint", "breakpoints": true + }, + { + "idx": 2, + "version": "7", + "when": 1757188795695, + "tag": "0002_little_scarlet_witch", + "breakpoints": true } ] } diff --git a/packages/evals/src/db/schema.ts b/packages/evals/src/db/schema.ts index 73705ac054..10c7ae7a99 100644 --- a/packages/evals/src/db/schema.ts +++ b/packages/evals/src/db/schema.ts @@ -13,6 +13,9 @@ export const runs = pgTable("runs", { id: integer().primaryKey().generatedAlwaysAsIdentity(), taskMetricsId: integer("task_metrics_id").references(() => taskMetrics.id), model: text().notNull(), + contextWindow: integer("context_window"), + pricePerMillionInputTokens: real("price_per_million_input_tokens"), + pricePerMillionOutputTokens: real("price_per_million_output_tokens"), description: text(), settings: jsonb().$type(), pid: integer(),