Skip to content

Commit f1a8af9

Browse files
committed
add gemini evals
1 parent 6425d01 commit f1a8af9

File tree

6 files changed

+135
-19
lines changed

6 files changed

+135
-19
lines changed

apps/workers-bindings/evals/kv_namespaces.eval.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ eachModel('$modelName', ({ model }) => {
1919
task: async (input: string) => {
2020
const client = await initializeClient(/* Pass necessary mocks/config */)
2121
const { promptOutput, toolCalls } = await runTask(client, model, input)
22-
22+
console.log('Creating kv namespace', JSON.stringify(toolCalls, null, 2))
2323
const toolCall = toolCalls.find(
2424
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_create
2525
)
@@ -41,7 +41,7 @@ eachModel('$modelName', ({ model }) => {
4141
task: async (input: string) => {
4242
const client = await initializeClient(/* Pass necessary mocks/config */)
4343
const { promptOutput, toolCalls } = await runTask(client, model, input)
44-
44+
console.log('Listing kv namespaces', JSON.stringify(toolCalls, null, 2))
4545
const toolCall = toolCalls.find(
4646
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespaces_list
4747
)
@@ -57,13 +57,14 @@ eachModel('$modelName', ({ model }) => {
5757
data: async () => [
5858
{
5959
input:
60-
'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace".',
60+
'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace". Assume the namespace exists. No need to look it up.',
6161
expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_update} tool should be called to rename the kv namespace.`,
6262
},
6363
],
6464
task: async (input: string) => {
6565
const client = await initializeClient(/* Pass necessary mocks/config */)
6666
const { promptOutput, toolCalls } = await runTask(client, model, input)
67+
console.log('Renaming kv namespace', JSON.stringify(toolCalls, null, 2))
6768
const toolCall = toolCalls.find(
6869
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_update
6970
)
@@ -85,7 +86,7 @@ eachModel('$modelName', ({ model }) => {
8586
task: async (input: string) => {
8687
const client = await initializeClient(/* Pass necessary mocks/config */)
8788
const { promptOutput, toolCalls } = await runTask(client, model, input)
88-
89+
console.log('Getting kv namespace details', JSON.stringify(toolCalls, null, 2))
8990
const toolCall = toolCalls.find(
9091
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_get
9192
)
@@ -100,14 +101,13 @@ eachModel('$modelName', ({ model }) => {
100101
describeEval('Delete Cloudflare KV Namespace', {
101102
data: async () => [
102103
{
103-
input: 'Look up the id of my only KV namespace and delete it.',
104+
input: 'Delete the "my-new-test-namespace" kv namespace.',
104105
expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_delete} tool should be called to delete the kv namespace.`,
105106
},
106107
],
107108
task: async (input: string) => {
108109
const client = await initializeClient(/* Pass necessary mocks/config */)
109110
const { promptOutput, toolCalls } = await runTask(client, model, input)
110-
111111
const toolCall = toolCalls.find(
112112
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_delete
113113
)

apps/workers-bindings/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"deploy": "wrangler deploy",
99
"deploy:staging": "wrangler deploy --env staging",
1010
"deploy:production": "wrangler deploy --env production",
11-
"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest --testTimeout=60000 --config vitest.config.evals.ts'",
11+
"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest kv_namespaces.eval.ts --testTimeout=60000 --config vitest.config.evals.ts'",
1212
"eval:server": "wrangler dev --var ENVIRONMENT:test --var DEV_DISABLE_OAUTH:true --var DEV_CLOUDFLARE_EMAIL:[email protected] --inspector-port 9230 --port 8977",
1313
"eval:ci": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest run --testTimeout=60000 --config vitest.config.evals.ts'",
1414
"dev": "wrangler dev",

packages/eval-tools/package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@
1212
},
1313
"dependencies": {
1414
"@ai-sdk/anthropic": "1.2.11",
15+
"@ai-sdk/google": "^1.2.17",
1516
"@ai-sdk/openai": "1.3.20",
1617
"@cloudflare/vitest-pool-workers": "0.8.14",
1718
"agents": "0.0.67",
1819
"ai": "4.3.10",
20+
"ai-gateway-provider": "^0.0.6",
1921
"workers-ai-provider": "0.3.0",
2022
"wrangler": "4.10.0",
2123
"zod": "3.24.2"

packages/eval-tools/src/runTask.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
import { type MCPClientManager } from 'agents/mcp/client'
2-
import { jsonSchema, streamText, tool } from 'ai'
2+
import { generateText, jsonSchema, tool } from 'ai'
33
import { z } from 'zod'
44

5-
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
5+
import type { GenerateTextResult, LanguageModelV1, ToolCallPart, ToolSet } from 'ai'
66

77
export async function runTask(
88
clientManager: MCPClientManager,
99
model: LanguageModelV1,
1010
input: string
1111
): Promise<{
1212
promptOutput: string
13-
fullResult: StreamTextResult<ToolSet, never>
13+
fullResult: GenerateTextResult<ToolSet, never>
1414
toolCalls: ToolCallPart[]
1515
}> {
1616
const tools = clientManager.listTools()
@@ -43,7 +43,7 @@ export async function runTask(
4343
return acc
4444
}, {} as ToolSet)
4545

46-
const res = streamText({
46+
const res = await generateText({
4747
model,
4848
system:
4949
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
@@ -55,13 +55,13 @@ export async function runTask(
5555

5656
// we need to consume the full stream, so this is empty
5757
// eslint-disable-next-line no-empty
58-
for await (const _ of res.fullStream) {
59-
}
58+
// for await (const _ of res.fullStream) {
59+
// }
6060

6161
// convert into an LLM readable result so our factuality checker can validate tool calls
6262
let messagesWithTools = ''
6363
const toolCalls: ToolCallPart[] = []
64-
const response = await res.response
64+
const response = res.response;
6565
const messages = response.messages
6666

6767
for (const message of messages) {

packages/eval-tools/src/test-models.ts

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@ import { createAnthropic } from '@ai-sdk/anthropic'
22
import { AnthropicMessagesModelId } from '@ai-sdk/anthropic/internal'
33
import { createOpenAI } from '@ai-sdk/openai'
44
import { OpenAIChatModelId } from '@ai-sdk/openai/internal'
5+
import { createAiGateway } from 'ai-gateway-provider'
6+
import { createGoogleGenerativeAI } from '@ai-sdk/google'
57
import { env } from 'cloudflare:test'
68
import { describe } from 'vitest'
79
import { createWorkersAI } from 'workers-ai-provider'
10+
import { GoogleGenerativeAILanguageModel } from '@ai-sdk/google/internal'
811

912
export const factualityModel = getOpenAiModel('gpt-4o')
1013

@@ -30,14 +33,39 @@ function getOpenAiModel(modelName: OpenAIChatModelId) {
3033
}
3134

3235
function getAnthropicModel(modelName: AnthropicMessagesModelId) {
33-
if (!env.ANTHROPIC_KEY) {
34-
throw new Error('No Anthropic key set!')
36+
if (!env.CLOUDFLARE_ACCOUNT_ID || !env.AI_GATEWAY_ID || !env.AI_GATEWAY_TOKEN) {
37+
throw new Error('No AI gateway credentials set!')
3538
}
39+
40+
const aigateway = createAiGateway({
41+
accountId: env.CLOUDFLARE_ACCOUNT_ID,
42+
gateway: env.AI_GATEWAY_ID,
43+
apiKey: env.AI_GATEWAY_TOKEN,
44+
});
45+
3646
const ai = createAnthropic({
37-
apiKey: env.ANTHROPIC_KEY,
47+
apiKey: '',
3848
})
3949

40-
const model = ai(modelName)
50+
const model = aigateway([ai(modelName)]);
51+
52+
return { modelName, model, ai }
53+
}
54+
55+
function getGeminiModel(modelName: GoogleGenerativeAILanguageModel['modelId']) {
56+
if (!env.CLOUDFLARE_ACCOUNT_ID || !env.AI_GATEWAY_ID || !env.AI_GATEWAY_TOKEN) {
57+
throw new Error('No AI gateway credentials set!')
58+
}
59+
60+
const aigateway = createAiGateway({
61+
accountId: env.CLOUDFLARE_ACCOUNT_ID,
62+
gateway: env.AI_GATEWAY_ID,
63+
apiKey: env.AI_GATEWAY_TOKEN,
64+
});
65+
66+
const ai = createGoogleGenerativeAI({ apiKey: ''})
67+
68+
const model = aigateway([ai(modelName)])
4169

4270
return { modelName, model, ai }
4371
}
@@ -56,7 +84,8 @@ function getWorkersAiModel(modelName: AiTextGenerationModels) {
5684
export const eachModel = describe.each([
5785
getOpenAiModel('gpt-4o'),
5886
getOpenAiModel('gpt-4o-mini'),
59-
getAnthropicModel('claude-3-5-sonnet-latest'),
87+
getAnthropicModel('claude-3-5-sonnet-20241022'),
88+
getGeminiModel('gemini-2.5-pro-exp-03-25')
6089
// llama 3 is somewhat inconsistent
6190
//getWorkersAiModel("@cf/meta/llama-3.3-70b-instruct-fp8-fast")
6291
// Currently llama 4 is having issues with tool calling

pnpm-lock.yaml

Lines changed: 85 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)