* feat: add anthropic models to evals
* chore: CI fixes
* chore: remove unused import and delete one model to avoid rate limits from anthropic
* chore: use 3.5-sonnet
* add gemini evals
* chore: syncpack things
* chore: update types
* feat: working gemini evals
* chore: remove anthropic eval for now
* chore: fix formatting
* feat: update openai models to use ai wholesaling
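The commits above add Anthropic and Gemini models to the eval matrix. As a rough sketch of what a Gemini entry could look like with the AI SDK's Google provider (the `getGoogleAiModel` helper and the model id are illustrative assumptions, not code from this PR):

```ts
import { google } from '@ai-sdk/google'
import type { LanguageModel } from 'ai'

// Hypothetical helper mirroring the getOpenAiModel/getAnthropicModel shape used further down.
// The provider reads GOOGLE_GENERATIVE_AI_API_KEY from the environment.
function getGoogleAiModel(modelName: string): { modelName: string; model: LanguageModel } {
  return { modelName, model: google(modelName) }
}

// Example entry that could be appended to the describe.each model matrix.
export const geminiModel = getGoogleAiModel('gemini-1.5-flash')
```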
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
@@ -53,15 +53,10 @@ export async function runTask(
     maxSteps: 10,
   })
 
-  // we need to consume the full stream, so this is empty
-  // eslint-disable-next-line no-empty
-  for await (const _ of res.fullStream) {
-  }
-
   // convert into an LLM readable result so our factuality checker can validate tool calls
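For context, the deleted lines are the usual pattern for draining a `streamText` `fullStream` so that the multi-step tool-calling loop runs to completion before the result is read. A minimal sketch of that pattern, assuming a recent Vercel AI SDK where `streamText` returns its result synchronously; the tool, model choice, and function name are placeholders rather than this repo's code:

```ts
import { openai } from '@ai-sdk/openai'
import { streamText, tool } from 'ai'
import { z } from 'zod'

// Placeholder tool so the sketch is self-contained.
const weather = tool({
  description: 'Get the weather for a city',
  parameters: z.object({ city: z.string() }),
  execute: async ({ city }) => ({ city, tempC: 21 }),
})

export async function runTaskSketch(userQuery: string) {
  const res = streamText({
    model: openai('gpt-4o-mini'),
    system:
      "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
    prompt: userQuery,
    tools: { weather },
    maxSteps: 10,
  })

  // Draining fullStream forces every step, including tool calls, to execute;
  // the loop body is intentionally empty because only the final state matters.
  // eslint-disable-next-line no-empty
  for await (const _ of res.fullStream) {
  }

  // The aggregated outputs are exposed as promises on the result object.
  return { text: await res.text, toolCalls: await res.toolCalls }
}
```

Switching to `generateText` avoids the drain entirely, since it only resolves after all steps have finished, which may be why the loop could be removed here.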
@@ -40,11 +87,10 @@ function getWorkersAiModel(modelName: AiTextGenerationModels) {
 export const eachModel = describe.each([
   getOpenAiModel('gpt-4o'),
   getOpenAiModel('gpt-4o-mini'),
-
+  // getAnthropicModel('claude-3-5-sonnet-20241022'), TODO: The evals pass with anthropic, but our rate limit is so low with AI wholesaling that we can't use it in CI because it's impossible to get a complete run with the current limits
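The `eachModel` export drives the same eval suite across every configured model via vitest's `describe.each`. A minimal sketch of how the helpers above might be shaped, assuming AI SDK provider packages; the `EachModelCase` type and helper bodies are illustrative, not this repo's implementation, and the diff's `getWorkersAiModel` helper is omitted:

```ts
import { describe } from 'vitest'
import { openai } from '@ai-sdk/openai'
import { anthropic } from '@ai-sdk/anthropic'
import type { LanguageModel } from 'ai'

interface EachModelCase {
  modelName: string
  model: LanguageModel
}

// Hypothetical helpers in the shape the diff suggests.
function getOpenAiModel(modelName: string): EachModelCase {
  return { modelName, model: openai(modelName) }
}

function getAnthropicModel(modelName: string): EachModelCase {
  return { modelName, model: anthropic(modelName) }
}

// Each array entry becomes its own describe block, so every model
// runs through an identical set of eval tests.
export const eachModel = describe.each([
  getOpenAiModel('gpt-4o'),
  getOpenAiModel('gpt-4o-mini'),
  // getAnthropicModel('claude-3-5-sonnet-20241022'), // re-enable once Anthropic rate limits allow a full CI run
])

// Usage: eachModel('$modelName', ({ model }) => { /* run the eval tasks against `model` */ })
```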