Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions apps/web-evals/src/actions/runs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,21 @@ import { CreateRun } from "@/lib/schemas"

const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")

// eslint-disable-next-line @typescript-eslint/no-unused-vars
export async function createRun({ suite, exercises = [], systemPrompt, timeout, ...values }: CreateRun) {
export async function createRun({
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intentional? The function now accepts additional parameters that aren't part of the CreateRun schema. Consider creating a proper extended type for better type safety:

Suggested change
export async function createRun({
type CreateRunWithPricing = CreateRun & {
contextWindow?: number;
pricePerMillionInputTokens?: number;
pricePerMillionOutputTokens?: number;
}
export async function createRun({
suite,
exercises = [],
timeout,
contextWindow,
pricePerMillionInputTokens,
pricePerMillionOutputTokens,
...values
}: CreateRunWithPricing) {

suite,
exercises = [],
timeout,
contextWindow,
pricePerMillionInputTokens,
pricePerMillionOutputTokens,
...values
}: CreateRun & { contextWindow?: number; pricePerMillionInputTokens?: number; pricePerMillionOutputTokens?: number }) {
const run = await _createRun({
...values,
timeout,
contextWindow,
pricePerMillionInputTokens,
pricePerMillionOutputTokens,
socketPath: "", // TODO: Get rid of this.
})

Expand Down
9 changes: 9 additions & 0 deletions apps/web-evals/src/app/runs/[id]/run.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ export function Run({ run }: { run: Run }) {
<div className="mb-2">
<div>
<div className="font-mono">{run.model}</div>
<div className="flex gap-4 text-sm text-muted-foreground">
{run.contextWindow && <span>Context: {(run.contextWindow / 1000).toFixed(0)}k tokens</span>}
{(run.pricePerMillionInputTokens || run.pricePerMillionOutputTokens) && (
<span>
Pricing: ${run.pricePerMillionInputTokens?.toFixed(2) || "?"} / $
{run.pricePerMillionOutputTokens?.toFixed(2) || "?"} per 1M tokens
</span>
)}
</div>
{run.description && <div className="text-sm text-muted-foreground">{run.description}</div>}
</div>
{!run.taskMetricsId && <RunStatus runStatus={runStatus} />}
Expand Down
42 changes: 29 additions & 13 deletions apps/web-evals/src/app/runs/new/new-run.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import {
TIMEOUT_DEFAULT,
} from "@/lib/schemas"
import { cn } from "@/lib/utils"
import { useOpenRouterModels } from "@/hooks/use-open-router-models"
import { useOpenRouterModels, getModelDetails, getPricingPerMillion } from "@/hooks/use-open-router-models"
import {
Button,
FormControl,
Expand Down Expand Up @@ -95,6 +95,21 @@ export function NewRun() {
try {
if (mode === "openrouter") {
values.settings = { ...(values.settings || {}), openRouterModelId: model }

// Get model details and add to the run
const modelDetails = getModelDetails(models.data, model)
if (modelDetails) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When getModelDetails returns null, we silently fall through to the regular createRun without pricing data. Should we log a warning here to help with debugging?

Suggested change
if (modelDetails) {
if (modelDetails) {
const pricing = getPricingPerMillion(modelDetails.pricing)
const extendedValues = {
...values,
contextWindow: modelDetails.context_length,
pricePerMillionInputTokens: pricing.input,
pricePerMillionOutputTokens: pricing.output,
}
const { id } = await createRun(extendedValues)
router.push(`/runs/${id}`)
return
} else {
console.warn(`Model details not found for OpenRouter model: ${model}`)
}

const pricing = getPricingPerMillion(modelDetails.pricing)
const extendedValues = {
...values,
contextWindow: modelDetails.context_length,
pricePerMillionInputTokens: pricing.input,
pricePerMillionOutputTokens: pricing.output,
}
const { id } = await createRun(extendedValues)
router.push(`/runs/${id}`)
return
}
}

const { id } = await createRun(values)
Expand All @@ -103,7 +118,7 @@ export function NewRun() {
toast.error(e instanceof Error ? e.message : "An unknown error occurred.")
}
},
[mode, model, router],
[mode, model, models.data, router],
)

const onFilterModels = useCallback(
Expand All @@ -112,13 +127,12 @@ export function NewRun() {
modelSearchValueRef.current = search
modelSearchResultsRef.current.clear()

for (const {
obj: { id },
score,
} of fuzzysort.go(search, models.data || [], {
const results = fuzzysort.go(search, models.data || [], {
key: "name",
})) {
modelSearchResultsRef.current.set(id, score)
})

for (const result of results) {
modelSearchResultsRef.current.set(result.obj.id, result.score)
}
}

Expand Down Expand Up @@ -210,16 +224,18 @@ export function NewRun() {
<CommandList>
<CommandEmpty>No model found.</CommandEmpty>
<CommandGroup>
{models.data?.map(({ id, name }) => (
{models.data?.map((modelItem) => (
<CommandItem
key={id}
value={id}
key={modelItem.id}
value={modelItem.id}
onSelect={onSelectModel}>
{name}
{modelItem.name}
<Check
className={cn(
"ml-auto text-accent group-data-[selected=true]:text-accent-foreground size-4",
id === model ? "opacity-100" : "opacity-0",
modelItem.id === model
? "opacity-100"
: "opacity-0",
)}
/>
</CommandItem>
Expand Down
25 changes: 23 additions & 2 deletions apps/web-evals/src/components/home/run.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,16 @@ export function Run({ run, taskMetrics }: RunProps) {
return (
<>
<TableRow>
<TableCell>{run.model}</TableCell>
<TableCell>
<div>
<div>{run.model}</div>
{run.contextWindow && (
<div className="text-xs text-muted-foreground">
{(run.contextWindow / 1000).toFixed(0)}k context
</div>
)}
</div>
</TableCell>
<TableCell>{run.passed}</TableCell>
<TableCell>{run.failed}</TableCell>
<TableCell>
Expand All @@ -76,7 +85,19 @@ export function Run({ run, taskMetrics }: RunProps) {
</div>
)}
</TableCell>
<TableCell>{taskMetrics && formatCurrency(taskMetrics.cost)}</TableCell>
<TableCell>
{taskMetrics && (
<div>
<div>{formatCurrency(taskMetrics.cost)}</div>
{(run.pricePerMillionInputTokens || run.pricePerMillionOutputTokens) && (
<div className="text-xs text-muted-foreground">
${run.pricePerMillionInputTokens?.toFixed(2) || "?"}/$
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pricing display format here shows as "$/$/M" which differs from the format in runs/[id]/run.tsx that shows "Pricing: $ / $ per 1M tokens". Could we standardize the display format across both components for consistency?

{run.pricePerMillionOutputTokens?.toFixed(2) || "?"}/M
</div>
)}
</div>
)}
</TableCell>
<TableCell>{taskMetrics && formatDuration(taskMetrics.duration)}</TableCell>
<TableCell>
<DropdownMenu>
Expand Down
34 changes: 34 additions & 0 deletions apps/web-evals/src/hooks/use-open-router-models.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
import { z } from "zod"
import { useQuery } from "@tanstack/react-query"

// Extended schema to include context window and pricing information.
// Only `id` and `name` are required; `context_length` and `pricing` (and each
// field inside `pricing`) are optional, so an entry that omits them still validates.
export const openRouterModelSchema = z.object({
id: z.string(),
name: z.string(),
// Optional numeric context length. NOTE(review): presumably measured in tokens — confirm against the OpenRouter API.
context_length: z.number().optional(),
// Optional pricing object. Each price is accepted as either a string or a number,
// so consumers must handle both representations.
pricing: z
.object({
prompt: z.union([z.string(), z.number()]).optional(),
completion: z.union([z.string(), z.number()]).optional(),
})
.optional(),
})

export type OpenRouterModel = z.infer<typeof openRouterModelSchema>
Expand All @@ -29,4 +37,30 @@ export const useOpenRouterModels = () =>
useQuery({
queryKey: ["getOpenRouterModels"],
queryFn: getOpenRouterModels,
staleTime: 1000 * 60 * 60, // Cache for 1 hour
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 1-hour cache time is hardcoded. Would it be useful to make this configurable for different environments (dev/staging/prod)?

Suggested change
staleTime: 1000 * 60 * 60, // Cache for 1 hour
const CACHE_DURATION = process.env.OPENROUTER_CACHE_DURATION ? parseInt(process.env.OPENROUTER_CACHE_DURATION) : 1000 * 60 * 60;
export const useOpenRouterModels = () =>
useQuery({
queryKey: ["getOpenRouterModels"],
queryFn: getOpenRouterModels,
staleTime: CACHE_DURATION, // Cache based on environment
gcTime: CACHE_DURATION * 24, // Keep in cache for 24x the cache duration
})

gcTime: 1000 * 60 * 60 * 24, // Keep in cache for 24 hours (gcTime replaces cacheTime in v5)
})

/**
 * Looks up a model by its ID in a list of OpenRouter models.
 *
 * @param models - List to search; may be `undefined` when no model data is available.
 * @param modelId - ID compared against each model's `id` using strict equality.
 * @returns The first model whose `id` matches, or `null` when the list is
 *   missing or contains no match.
 */
export const getModelDetails = (models: OpenRouterModel[] | undefined, modelId: string): OpenRouterModel | null => {
  // `Array.prototype.find` yields `undefined` on a miss; normalize it to `null`
  // so "no list" and "no match" are reported with the same value.
  return models?.find((m) => m.id === modelId) ?? null
}

/**
 * Converts a model's pricing values into prices per one million tokens.
 *
 * @param pricing - Optional pricing object whose `prompt` and `completion`
 *   fields may each be a string, a number, or absent.
 * @returns An object `{ input, output }` where `input` is derived from
 *   `pricing.prompt` and `output` from `pricing.completion`. Each value is the
 *   parsed price multiplied by 1,000,000, or `undefined` when the source value
 *   is missing, cannot be parsed, is not finite, or is negative.
 */
export const getPricingPerMillion = (pricing: OpenRouterModel["pricing"]) => {
  if (!pricing) return { input: undefined, output: undefined }

  const parsePrice = (price: string | number | undefined): number | undefined => {
    if (price === undefined) return undefined
    const numPrice = typeof price === "string" ? parseFloat(price) : price
    // Reject NaN/Infinity as well as negative values: a negative number is not
    // a usable price (presumably a sentinel for variable pricing — confirm
    // against the OpenRouter API) and must not be scaled and shown as a cost.
    if (!Number.isFinite(numPrice) || numPrice < 0) return undefined
    // OpenRouter prices are typically per token, convert to per million
    return numPrice * 1_000_000
  }

  return {
    input: parsePrice(pricing.prompt),
    output: parsePrice(pricing.completion),
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ALTER TABLE "runs" ADD COLUMN "context_window" integer;--> statement-breakpoint
ALTER TABLE "runs" ADD COLUMN "price_per_million_input_tokens" real;--> statement-breakpoint
ALTER TABLE "runs" ADD COLUMN "price_per_million_output_tokens" real;
Loading
Loading