diff --git a/apps/web-evals/src/actions/runs.ts b/apps/web-evals/src/actions/runs.ts
index 2eae1f6804..b89d404ab7 100644
--- a/apps/web-evals/src/actions/runs.ts
+++ b/apps/web-evals/src/actions/runs.ts
@@ -21,11 +21,21 @@ import { CreateRun } from "@/lib/schemas"
const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
-// eslint-disable-next-line @typescript-eslint/no-unused-vars
-export async function createRun({ suite, exercises = [], systemPrompt, timeout, ...values }: CreateRun) {
+export async function createRun({
+ suite,
+ exercises = [],
+ timeout,
+ contextWindow,
+ pricePerMillionInputTokens,
+ pricePerMillionOutputTokens,
+ ...values
+}: CreateRun & { contextWindow?: number; pricePerMillionInputTokens?: number; pricePerMillionOutputTokens?: number }) {
const run = await _createRun({
...values,
timeout,
+ contextWindow,
+ pricePerMillionInputTokens,
+ pricePerMillionOutputTokens,
socketPath: "", // TODO: Get rid of this.
})
diff --git a/apps/web-evals/src/app/runs/[id]/run.tsx b/apps/web-evals/src/app/runs/[id]/run.tsx
index b6c5290b13..95792f16d4 100644
--- a/apps/web-evals/src/app/runs/[id]/run.tsx
+++ b/apps/web-evals/src/app/runs/[id]/run.tsx
@@ -47,6 +47,15 @@ export function Run({ run }: { run: Run }) {
{run.model}
+
+ {run.contextWindow && Context: {(run.contextWindow / 1000).toFixed(0)}k tokens}
+ {(run.pricePerMillionInputTokens || run.pricePerMillionOutputTokens) && (
+
+ Pricing: ${run.pricePerMillionInputTokens?.toFixed(2) || "?"} / $
+ {run.pricePerMillionOutputTokens?.toFixed(2) || "?"} per 1M tokens
+
+ )}
+
{run.description &&
{run.description}
}
{!run.taskMetricsId &&
}
diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx
index 41d35f3c4c..7f2c39263b 100644
--- a/apps/web-evals/src/app/runs/new/new-run.tsx
+++ b/apps/web-evals/src/app/runs/new/new-run.tsx
@@ -26,7 +26,7 @@ import {
TIMEOUT_DEFAULT,
} from "@/lib/schemas"
import { cn } from "@/lib/utils"
-import { useOpenRouterModels } from "@/hooks/use-open-router-models"
+import { useOpenRouterModels, getModelDetails, getPricingPerMillion } from "@/hooks/use-open-router-models"
import {
Button,
FormControl,
@@ -95,6 +95,21 @@ export function NewRun() {
try {
if (mode === "openrouter") {
values.settings = { ...(values.settings || {}), openRouterModelId: model }
+
+ // Get model details and add to the run
+ const modelDetails = getModelDetails(models.data, model)
+ if (modelDetails) {
+ const pricing = getPricingPerMillion(modelDetails.pricing)
+ const extendedValues = {
+ ...values,
+ contextWindow: modelDetails.context_length,
+ pricePerMillionInputTokens: pricing.input,
+ pricePerMillionOutputTokens: pricing.output,
+ }
+ const { id } = await createRun(extendedValues)
+ router.push(`/runs/${id}`)
+ return
+ }
}
const { id } = await createRun(values)
@@ -103,7 +118,7 @@ export function NewRun() {
toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
}
},
- [mode, model, router],
+ [mode, model, models.data, router],
)
const onFilterModels = useCallback(
@@ -112,13 +127,12 @@ export function NewRun() {
modelSearchValueRef.current = search
modelSearchResultsRef.current.clear()
- for (const {
- obj: { id },
- score,
- } of fuzzysort.go(search, models.data || [], {
+ const results = fuzzysort.go(search, models.data || [], {
key: "name",
- })) {
- modelSearchResultsRef.current.set(id, score)
+ })
+
+ for (const result of results) {
+ modelSearchResultsRef.current.set(result.obj.id, result.score)
}
}
@@ -210,16 +224,18 @@ export function NewRun() {
No model found.
- {models.data?.map(({ id, name }) => (
+ {models.data?.map((modelItem) => (
- {name}
+ {modelItem.name}
diff --git a/apps/web-evals/src/components/home/run.tsx b/apps/web-evals/src/components/home/run.tsx
index c35673885c..e5bad98ffb 100644
--- a/apps/web-evals/src/components/home/run.tsx
+++ b/apps/web-evals/src/components/home/run.tsx
@@ -51,7 +51,16 @@ export function Run({ run, taskMetrics }: RunProps) {
return (
<>
- {run.model}
+
+
+
{run.model}
+ {run.contextWindow && (
+
+ {(run.contextWindow / 1000).toFixed(0)}k context
+
+ )}
+
+
{run.passed}
{run.failed}
@@ -76,7 +85,19 @@ export function Run({ run, taskMetrics }: RunProps) {
)}
- {taskMetrics && formatCurrency(taskMetrics.cost)}
+
+ {taskMetrics && (
+
+
{formatCurrency(taskMetrics.cost)}
+ {(run.pricePerMillionInputTokens || run.pricePerMillionOutputTokens) && (
+
+ ${run.pricePerMillionInputTokens?.toFixed(2) || "?"}/$
+ {run.pricePerMillionOutputTokens?.toFixed(2) || "?"}/M
+
+ )}
+
+ )}
+
{taskMetrics && formatDuration(taskMetrics.duration)}
diff --git a/apps/web-evals/src/hooks/use-open-router-models.ts b/apps/web-evals/src/hooks/use-open-router-models.ts
index 27800f90f2..b16183291d 100644
--- a/apps/web-evals/src/hooks/use-open-router-models.ts
+++ b/apps/web-evals/src/hooks/use-open-router-models.ts
@@ -1,9 +1,17 @@
import { z } from "zod"
import { useQuery } from "@tanstack/react-query"
+// Zod schema for one OpenRouter model entry: required id and name, plus optional context-window and pricing fields.
export const openRouterModelSchema = z.object({
id: z.string(),
name: z.string(),
+ context_length: z.number().optional(), // optional numeric context length; copied to a run's contextWindow by the new-run form
+ pricing: z
+ .object({
+ prompt: z.union([z.string(), z.number()]).optional(), // string or number; read as the input price by getPricingPerMillion
+ completion: z.union([z.string(), z.number()]).optional(), // string or number; read as the output price by getPricingPerMillion
+ })
+ .optional(), // the whole pricing object may be absent
})
export type OpenRouterModel = z.infer
@@ -29,4 +37,30 @@ export const useOpenRouterModels = () =>
useQuery({
queryKey: ["getOpenRouterModels"],
queryFn: getOpenRouterModels,
+ staleTime: 1000 * 60 * 60, // Cache for 1 hour
+ gcTime: 1000 * 60 * 60 * 24, // Keep in cache for 24 hours (gcTime replaces cacheTime in v5)
})
+
+// Looks up the entry in `models` whose `id` strictly equals `modelId`. Note the two distinct "no
+// result" values: `null` when `models` itself is undefined, `undefined` when no entry matches.
+export const getModelDetails = (models: OpenRouterModel[] | undefined, modelId: string) => {
+	return models ? models.find((candidate) => candidate.id === modelId) : null
+}
+
+// Converts a model's `pricing` into prices per million tokens: `input` comes from `pricing.prompt`
+// and `output` from `pricing.completion`. A side is `undefined` when it is missing, does not parse
+// to a finite number, or is negative, so a sentinel value is never scaled into a bogus price.
+// NOTE(review): OpenRouter appears to list "-1" for models without a fixed price — confirm against its API.
+export const getPricingPerMillion = (pricing: OpenRouterModel["pricing"]) => {
+	const parsePrice = (price: string | number | undefined): number | undefined => {
+		if (price === undefined) return undefined
+		const numPrice = typeof price === "string" ? parseFloat(price) : price
+		// Number.isFinite rejects NaN and +/-Infinity; a negative value cannot be a real price.
+		if (!Number.isFinite(numPrice) || numPrice < 0) return undefined
+		// OpenRouter prices are typically per token, convert to per million
+		return numPrice * 1_000_000
+	}
+
+	return { input: parsePrice(pricing?.prompt), output: parsePrice(pricing?.completion) }
+}
diff --git a/packages/evals/src/db/migrations/0002_little_scarlet_witch.sql b/packages/evals/src/db/migrations/0002_little_scarlet_witch.sql
new file mode 100644
index 0000000000..f8bf3d0158
--- /dev/null
+++ b/packages/evals/src/db/migrations/0002_little_scarlet_witch.sql
@@ -0,0 +1,3 @@
+ALTER TABLE "runs" ADD COLUMN "context_window" integer;--> statement-breakpoint
+ALTER TABLE "runs" ADD COLUMN "price_per_million_input_tokens" real;--> statement-breakpoint
+ALTER TABLE "runs" ADD COLUMN "price_per_million_output_tokens" real;
\ No newline at end of file
diff --git a/packages/evals/src/db/migrations/meta/0002_snapshot.json b/packages/evals/src/db/migrations/meta/0002_snapshot.json
new file mode 100644
index 0000000000..1cd9762de0
--- /dev/null
+++ b/packages/evals/src/db/migrations/meta/0002_snapshot.json
@@ -0,0 +1,435 @@
+{
+ "id": "4b1272d7-8100-4ceb-9cc8-dcf6eedace13",
+ "prevId": "43b197c4-ff4f-48c1-908b-a330e66a162d",
+ "version": "7",
+ "dialect": "postgresql",
+ "tables": {
+ "public.runs": {
+ "name": "runs",
+ "schema": "",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "integer",
+ "primaryKey": true,
+ "notNull": true,
+ "identity": {
+ "type": "always",
+ "name": "runs_id_seq",
+ "schema": "public",
+ "increment": "1",
+ "startWith": "1",
+ "minValue": "1",
+ "maxValue": "2147483647",
+ "cache": "1",
+ "cycle": false
+ }
+ },
+ "task_metrics_id": {
+ "name": "task_metrics_id",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "model": {
+ "name": "model",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "context_window": {
+ "name": "context_window",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "price_per_million_input_tokens": {
+ "name": "price_per_million_input_tokens",
+ "type": "real",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "price_per_million_output_tokens": {
+ "name": "price_per_million_output_tokens",
+ "type": "real",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "description": {
+ "name": "description",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "settings": {
+ "name": "settings",
+ "type": "jsonb",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "pid": {
+ "name": "pid",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "socket_path": {
+ "name": "socket_path",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "concurrency": {
+ "name": "concurrency",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "default": 2
+ },
+ "timeout": {
+ "name": "timeout",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "default": 5
+ },
+ "passed": {
+ "name": "passed",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "default": 0
+ },
+ "failed": {
+ "name": "failed",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true,
+ "default": 0
+ },
+ "created_at": {
+ "name": "created_at",
+ "type": "timestamp",
+ "primaryKey": false,
+ "notNull": true
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "runs_task_metrics_id_taskMetrics_id_fk": {
+ "name": "runs_task_metrics_id_taskMetrics_id_fk",
+ "tableFrom": "runs",
+ "tableTo": "taskMetrics",
+ "columnsFrom": ["task_metrics_id"],
+ "columnsTo": ["id"],
+ "onDelete": "no action",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {},
+ "policies": {},
+ "checkConstraints": {},
+ "isRLSEnabled": false
+ },
+ "public.taskMetrics": {
+ "name": "taskMetrics",
+ "schema": "",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "integer",
+ "primaryKey": true,
+ "notNull": true,
+ "identity": {
+ "type": "always",
+ "name": "taskMetrics_id_seq",
+ "schema": "public",
+ "increment": "1",
+ "startWith": "1",
+ "minValue": "1",
+ "maxValue": "2147483647",
+ "cache": "1",
+ "cycle": false
+ }
+ },
+ "tokens_in": {
+ "name": "tokens_in",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "tokens_out": {
+ "name": "tokens_out",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "tokens_context": {
+ "name": "tokens_context",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "cache_writes": {
+ "name": "cache_writes",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "cache_reads": {
+ "name": "cache_reads",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "cost": {
+ "name": "cost",
+ "type": "real",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "duration": {
+ "name": "duration",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "tool_usage": {
+ "name": "tool_usage",
+ "type": "jsonb",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "created_at": {
+ "name": "created_at",
+ "type": "timestamp",
+ "primaryKey": false,
+ "notNull": true
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {},
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {},
+ "policies": {},
+ "checkConstraints": {},
+ "isRLSEnabled": false
+ },
+ "public.tasks": {
+ "name": "tasks",
+ "schema": "",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "integer",
+ "primaryKey": true,
+ "notNull": true,
+ "identity": {
+ "type": "always",
+ "name": "tasks_id_seq",
+ "schema": "public",
+ "increment": "1",
+ "startWith": "1",
+ "minValue": "1",
+ "maxValue": "2147483647",
+ "cache": "1",
+ "cycle": false
+ }
+ },
+ "run_id": {
+ "name": "run_id",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "task_metrics_id": {
+ "name": "task_metrics_id",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "language": {
+ "name": "language",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "exercise": {
+ "name": "exercise",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "passed": {
+ "name": "passed",
+ "type": "boolean",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "started_at": {
+ "name": "started_at",
+ "type": "timestamp",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "finished_at": {
+ "name": "finished_at",
+ "type": "timestamp",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "created_at": {
+ "name": "created_at",
+ "type": "timestamp",
+ "primaryKey": false,
+ "notNull": true
+ }
+ },
+ "indexes": {
+ "tasks_language_exercise_idx": {
+ "name": "tasks_language_exercise_idx",
+ "columns": [
+ {
+ "expression": "run_id",
+ "isExpression": false,
+ "asc": true,
+ "nulls": "last"
+ },
+ {
+ "expression": "language",
+ "isExpression": false,
+ "asc": true,
+ "nulls": "last"
+ },
+ {
+ "expression": "exercise",
+ "isExpression": false,
+ "asc": true,
+ "nulls": "last"
+ }
+ ],
+ "isUnique": true,
+ "concurrently": false,
+ "method": "btree",
+ "with": {}
+ }
+ },
+ "foreignKeys": {
+ "tasks_run_id_runs_id_fk": {
+ "name": "tasks_run_id_runs_id_fk",
+ "tableFrom": "tasks",
+ "tableTo": "runs",
+ "columnsFrom": ["run_id"],
+ "columnsTo": ["id"],
+ "onDelete": "no action",
+ "onUpdate": "no action"
+ },
+ "tasks_task_metrics_id_taskMetrics_id_fk": {
+ "name": "tasks_task_metrics_id_taskMetrics_id_fk",
+ "tableFrom": "tasks",
+ "tableTo": "taskMetrics",
+ "columnsFrom": ["task_metrics_id"],
+ "columnsTo": ["id"],
+ "onDelete": "no action",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {},
+ "policies": {},
+ "checkConstraints": {},
+ "isRLSEnabled": false
+ },
+ "public.toolErrors": {
+ "name": "toolErrors",
+ "schema": "",
+ "columns": {
+ "id": {
+ "name": "id",
+ "type": "integer",
+ "primaryKey": true,
+ "notNull": true,
+ "identity": {
+ "type": "always",
+ "name": "toolErrors_id_seq",
+ "schema": "public",
+ "increment": "1",
+ "startWith": "1",
+ "minValue": "1",
+ "maxValue": "2147483647",
+ "cache": "1",
+ "cycle": false
+ }
+ },
+ "run_id": {
+ "name": "run_id",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "task_id": {
+ "name": "task_id",
+ "type": "integer",
+ "primaryKey": false,
+ "notNull": false
+ },
+ "tool_name": {
+ "name": "tool_name",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "error": {
+ "name": "error",
+ "type": "text",
+ "primaryKey": false,
+ "notNull": true
+ },
+ "created_at": {
+ "name": "created_at",
+ "type": "timestamp",
+ "primaryKey": false,
+ "notNull": true
+ }
+ },
+ "indexes": {},
+ "foreignKeys": {
+ "toolErrors_run_id_runs_id_fk": {
+ "name": "toolErrors_run_id_runs_id_fk",
+ "tableFrom": "toolErrors",
+ "tableTo": "runs",
+ "columnsFrom": ["run_id"],
+ "columnsTo": ["id"],
+ "onDelete": "no action",
+ "onUpdate": "no action"
+ },
+ "toolErrors_task_id_tasks_id_fk": {
+ "name": "toolErrors_task_id_tasks_id_fk",
+ "tableFrom": "toolErrors",
+ "tableTo": "tasks",
+ "columnsFrom": ["task_id"],
+ "columnsTo": ["id"],
+ "onDelete": "no action",
+ "onUpdate": "no action"
+ }
+ },
+ "compositePrimaryKeys": {},
+ "uniqueConstraints": {},
+ "policies": {},
+ "checkConstraints": {},
+ "isRLSEnabled": false
+ }
+ },
+ "enums": {},
+ "schemas": {},
+ "sequences": {},
+ "roles": {},
+ "policies": {},
+ "views": {},
+ "_meta": {
+ "columns": {},
+ "schemas": {},
+ "tables": {}
+ }
+}
diff --git a/packages/evals/src/db/migrations/meta/_journal.json b/packages/evals/src/db/migrations/meta/_journal.json
index e20425b105..e3eeaf5732 100644
--- a/packages/evals/src/db/migrations/meta/_journal.json
+++ b/packages/evals/src/db/migrations/meta/_journal.json
@@ -15,6 +15,13 @@
"when": 1753198630651,
"tag": "0001_lowly_captain_flint",
"breakpoints": true
+ },
+ {
+ "idx": 2,
+ "version": "7",
+ "when": 1757188795695,
+ "tag": "0002_little_scarlet_witch",
+ "breakpoints": true
}
]
}
diff --git a/packages/evals/src/db/schema.ts b/packages/evals/src/db/schema.ts
index 73705ac054..10c7ae7a99 100644
--- a/packages/evals/src/db/schema.ts
+++ b/packages/evals/src/db/schema.ts
@@ -13,6 +13,9 @@ export const runs = pgTable("runs", {
id: integer().primaryKey().generatedAlwaysAsIdentity(),
taskMetricsId: integer("task_metrics_id").references(() => taskMetrics.id),
model: text().notNull(),
+ contextWindow: integer("context_window"),
+ pricePerMillionInputTokens: real("price_per_million_input_tokens"),
+ pricePerMillionOutputTokens: real("price_per_million_output_tokens"),
description: text(),
settings: jsonb().$type(),
pid: integer(),