Add Gemini evals; route Anthropic and Gemini through AI Gateway and switch runTask to generateText

deloreyj · deloreyj · commit f1a8af960346 · 2025-05-09T16:42:22.000-05:00
diff --git a/apps/workers-bindings/evals/kv_namespaces.eval.ts b/apps/workers-bindings/evals/kv_namespaces.eval.ts
@@ -19,7 +19,7 @@ eachModel('$modelName', ({ model }) => {
 		task: async (input: string) => {
 			const client = await initializeClient(/* Pass necessary mocks/config */)
 			const { promptOutput, toolCalls } = await runTask(client, model, input)
-
+			console.log('Creating kv namespace', JSON.stringify(toolCalls, null, 2))
 			const toolCall = toolCalls.find(
 				(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_create
 			)
@@ -41,7 +41,7 @@ eachModel('$modelName', ({ model }) => {
 		task: async (input: string) => {
 			const client = await initializeClient(/* Pass necessary mocks/config */)
 			const { promptOutput, toolCalls } = await runTask(client, model, input)
-
+			console.log('Listing kv namespaces', JSON.stringify(toolCalls, null, 2))
 			const toolCall = toolCalls.find(
 				(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespaces_list
 			)
@@ -57,13 +57,14 @@ eachModel('$modelName', ({ model }) => {
 		data: async () => [
 			{
 				input:
-					'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace".',
+					'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace". Assume the namespace exists. No need to look it up.',
 				expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_update} tool should be called to rename the kv namespace.`,
 			},
 		],
 		task: async (input: string) => {
 			const client = await initializeClient(/* Pass necessary mocks/config */)
 			const { promptOutput, toolCalls } = await runTask(client, model, input)
+			console.log('Renaming kv namespace', JSON.stringify(toolCalls, null, 2))
 			const toolCall = toolCalls.find(
 				(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_update
 			)
@@ -85,7 +86,7 @@ eachModel('$modelName', ({ model }) => {
 		task: async (input: string) => {
 			const client = await initializeClient(/* Pass necessary mocks/config */)
 			const { promptOutput, toolCalls } = await runTask(client, model, input)
-
+			console.log('Getting kv namespace details', JSON.stringify(toolCalls, null, 2))
 			const toolCall = toolCalls.find(
 				(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_get
 			)
@@ -100,14 +101,13 @@ eachModel('$modelName', ({ model }) => {
 	describeEval('Delete Cloudflare KV Namespace', {
 		data: async () => [
 			{
-				input: 'Look up the id of my only KV namespace and delete it.',
+				input: 'Delete the "my-new-test-namespace" kv namespace.',
 				expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_delete} tool should be called to delete the kv namespace.`,
 			},
 		],
 		task: async (input: string) => {
 			const client = await initializeClient(/* Pass necessary mocks/config */)
 			const { promptOutput, toolCalls } = await runTask(client, model, input)
-
 			const toolCall = toolCalls.find(
 				(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_delete
 			)
diff --git a/apps/workers-bindings/package.json b/apps/workers-bindings/package.json
@@ -8,7 +8,7 @@
 		"deploy": "wrangler deploy",
 		"deploy:staging": "wrangler deploy --env staging",
 		"deploy:production": "wrangler deploy --env production",
-		"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest --testTimeout=60000 --config vitest.config.evals.ts'",
+		"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest kv_namespaces.eval.ts --testTimeout=60000 --config vitest.config.evals.ts'",
 		"eval:server": "wrangler dev --var ENVIRONMENT:test --var DEV_DISABLE_OAUTH:true --var DEV_CLOUDFLARE_EMAIL:mcp-server-eval-account@workers-for-platforms-dev.cfdata.org --inspector-port 9230 --port 8977",
 		"eval:ci": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest run --testTimeout=60000 --config vitest.config.evals.ts'",
 		"dev": "wrangler dev",
diff --git a/packages/eval-tools/package.json b/packages/eval-tools/package.json
@@ -12,10 +12,12 @@
 	},
 	"dependencies": {
 		"@ai-sdk/anthropic": "1.2.11",
+		"@ai-sdk/google": "^1.2.17",
 		"@ai-sdk/openai": "1.3.20",
 		"@cloudflare/vitest-pool-workers": "0.8.14",
 		"agents": "0.0.67",
 		"ai": "4.3.10",
+		"ai-gateway-provider": "^0.0.6",
 		"workers-ai-provider": "0.3.0",
 		"wrangler": "4.10.0",
 		"zod": "3.24.2"
diff --git a/packages/eval-tools/src/runTask.ts b/packages/eval-tools/src/runTask.ts
@@ -1,16 +1,16 @@
 import { type MCPClientManager } from 'agents/mcp/client'
-import { jsonSchema, streamText, tool } from 'ai'
+import { generateText, jsonSchema, tool } from 'ai'
 import { z } from 'zod'
 
-import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
+import type { GenerateTextResult, LanguageModelV1, ToolCallPart, ToolSet } from 'ai'
 
 export async function runTask(
 	clientManager: MCPClientManager,
 	model: LanguageModelV1,
 	input: string
 ): Promise<{
 	promptOutput: string
-	fullResult: StreamTextResult<ToolSet, never>
+	fullResult: GenerateTextResult<ToolSet, never>
 	toolCalls: ToolCallPart[]
 }> {
 	const tools = clientManager.listTools()
@@ -43,7 +43,7 @@ export async function runTask(
 		return acc
 	}, {} as ToolSet)
 
-	const res = streamText({
+	const res = await generateText({
 		model,
 		system:
 			"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
@@ -55,13 +55,13 @@ export async function runTask(
 
 	// we need to consume the fill stream, so this is empty
 	// eslint-disable-next-line no-empty
-	for await (const _ of res.fullStream) {
-	}
+	// for await (const _ of res.fullStream) {
+	// }
 
 	// convert into an LLM readable result so our factuality checker can validate tool calls
 	let messagesWithTools = ''
 	const toolCalls: ToolCallPart[] = []
-	const response = await res.response
+	const response = res.response;
 	const messages = response.messages
 
 	for (const message of messages) {
diff --git a/packages/eval-tools/src/test-models.ts b/packages/eval-tools/src/test-models.ts
@@ -2,9 +2,12 @@ import { createAnthropic } from '@ai-sdk/anthropic'
 import { AnthropicMessagesModelId } from '@ai-sdk/anthropic/internal'
 import { createOpenAI } from '@ai-sdk/openai'
 import { OpenAIChatModelId } from '@ai-sdk/openai/internal'
+import { createAiGateway } from 'ai-gateway-provider'
+import { createGoogleGenerativeAI } from '@ai-sdk/google'
 import { env } from 'cloudflare:test'
 import { describe } from 'vitest'
 import { createWorkersAI } from 'workers-ai-provider'
+import { GoogleGenerativeAILanguageModel } from '@ai-sdk/google/internal'
 
 export const factualityModel = getOpenAiModel('gpt-4o')
 
@@ -30,14 +33,39 @@ function getOpenAiModel(modelName: OpenAIChatModelId) {
 }
 
 function getAnthropicModel(modelName: AnthropicMessagesModelId) {
-	if (!env.ANTHROPIC_KEY) {
-		throw new Error('No Anthropic key set!')
+	if (!env.CLOUDFLARE_ACCOUNT_ID || !env.AI_GATEWAY_ID || !env.AI_GATEWAY_TOKEN) {
+		throw new Error('No AI gateway credentials set!')
 	}
+
+	const aigateway = createAiGateway({
+		accountId: env.CLOUDFLARE_ACCOUNT_ID,
+		gateway: env.AI_GATEWAY_ID,
+		apiKey: env.AI_GATEWAY_TOKEN,
+	});
+	
 	const ai = createAnthropic({
-		apiKey: env.ANTHROPIC_KEY,
+		apiKey: '',
 	})
 
-	const model = ai(modelName)
+	const model = aigateway([ai(modelName)]);
+
+	return { modelName, model, ai }
+}
+
+function getGeminiModel(modelName: GoogleGenerativeAILanguageModel['modelId']) {
+	if (!env.CLOUDFLARE_ACCOUNT_ID || !env.AI_GATEWAY_ID || !env.AI_GATEWAY_TOKEN) {
+		throw new Error('No AI gateway credentials set!')
+	}
+
+	const aigateway = createAiGateway({
+		accountId: env.CLOUDFLARE_ACCOUNT_ID,
+		gateway: env.AI_GATEWAY_ID,
+		apiKey: env.AI_GATEWAY_TOKEN,
+	});
+
+	const ai = createGoogleGenerativeAI({ apiKey: ''})
+
+	const model = aigateway([ai(modelName)])
 
 	return { modelName, model, ai }
 }
@@ -56,7 +84,8 @@ function getWorkersAiModel(modelName: AiTextGenerationModels) {
 export const eachModel = describe.each([
 	getOpenAiModel('gpt-4o'),
 	getOpenAiModel('gpt-4o-mini'),
-	getAnthropicModel('claude-3-5-sonnet-latest'),
+	getAnthropicModel('claude-3-5-sonnet-20241022'),
+	getGeminiModel('gemini-2.5-pro-exp-03-25')
 	// llama 3 is somewhat inconsistent
 	//getWorkersAiModel("@cf/meta/llama-3.3-70b-instruct-fp8-fast")
 	// Currently llama 4 is having issues with tool calling
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml