Skip to content

Commit 38aa001

Browse files
authored
Add evals for the bindings server and a hyperdrive binding (cloudflare#117)
* feat: add hyperdrive bindings and evals
* chore: fix dev mode and change CI to use dev mode
* chore: package version updates
* chore: fix formatting
* chore: do not fail with no tests
* fix: make evals work
* fix: formatting
* fix: change port
* fix: override inspector port
* chore: remove console.logs
* chore: fix formatting
* chore: PR feedback

---------

Co-authored-by: jdelorey@cloudflare.com <jdelorey@cloudflare.com>
1 parent 83574f9 commit 38aa001

File tree

22 files changed

+812
-69
lines changed

22 files changed

+812
-69
lines changed

.github/workflows/evals.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,12 @@ jobs:
2222
- name: Create .dev.vars file
2323
run: |
2424
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/sandbox-container/.dev.vars
25+
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/workers-bindings/.dev.vars
26+
echo "DEV_CLOUDFLARE_API_TOKEN=${{ secrets.DEV_CLOUDFLARE_API_TOKEN }}" >> ./apps/workers-bindings/.dev.vars
2527
- name: Verify .dev.vars file
2628
run: |
2729
du -h ./apps/sandbox-container/.dev.vars
30+
du -h ./apps/workers-bindings/.dev.vars
2831
- name: Install dependencies
2932
run: pnpm install
3033
- name: Run evals

apps/demo-day/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"@modelcontextprotocol/sdk": "1.10.2",
1515
"@repo/mcp-common": "workspace:*",
1616
"@repo/mcp-observability": "workspace:*",
17+
"@types/node": "22.14.1",
1718
"agents": "0.0.67",
1819
"zod": "3.24.2"
1920
},

apps/sandbox-container/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"@repo/eval-tools": "workspace:*",
2727
"@repo/mcp-common": "workspace:*",
2828
"@repo/mcp-observability": "workspace:*",
29+
"@types/node": "22.14.1",
2930
"agents": "0.0.67",
3031
"cron-schedule": "5.0.4",
3132
"esbuild": "0.25.1",
@@ -40,7 +41,7 @@
4041
"@cloudflare/vitest-pool-workers": "0.8.14",
4142
"@types/mock-fs": "4.13.4",
4243
"@types/node": "22.14.1",
43-
"ai": "4.3.6",
44+
"ai": "4.3.10",
4445
"concurrently": "9.1.2",
4546
"mock-fs": "5.5.0",
4647
"start-server-and-test": "2.0.11",

apps/sandbox-container/server/index.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,4 @@ export default {
7878
clientRegistrationEndpoint: '/register',
7979
}).fetch(req, env, ctx)
8080
},
81-
} /*
82-
83-
*/
81+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
// Define a mock account ID for testing
10+
const MOCK_ACCOUNT_ID = 'mock-account-12345'
11+
12+
eachModel('$modelName', ({ model }) => {
13+
describeEval('List Cloudflare Accounts', {
14+
data: async () => [
15+
{
16+
input: 'List all my Cloudflare accounts.',
17+
expected: 'The accounts_list tool should be called to retrieve the list of accounts.',
18+
},
19+
],
20+
task: async (input: string) => {
21+
const client = await initializeClient()
22+
const { promptOutput, toolCalls } = await runTask(client, model, input)
23+
24+
const toolCall = toolCalls.find((call) => call.toolName === 'accounts_list')
25+
expect(toolCall, 'Tool accounts_list was not called').toBeDefined()
26+
return promptOutput
27+
},
28+
scorers: [checkFactuality],
29+
threshold: 1,
30+
timeout: 60000, // 60 seconds
31+
})
32+
describeEval('Set Active Cloudflare Account', {
33+
data: async () => [
34+
{
35+
input: `Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}.`,
36+
expected: `The set_active_account tool should be called with the account ID ${MOCK_ACCOUNT_ID}.`,
37+
},
38+
],
39+
task: async (input: string) => {
40+
const client = await initializeClient()
41+
const { promptOutput, toolCalls } = await runTask(client, model, input)
42+
const toolCall = toolCalls.find((call) => call.toolName === 'set_active_account')
43+
expect(toolCall, 'Tool set_active_account was not called').toBeDefined()
44+
45+
expect(toolCall?.args, 'Arguments for set_active_account did not match').toEqual(
46+
expect.objectContaining({ activeAccountIdParam: MOCK_ACCOUNT_ID })
47+
)
48+
return promptOutput
49+
},
50+
scorers: [checkFactuality],
51+
threshold: 1,
52+
timeout: 60000, // 60 seconds
53+
})
54+
})
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
const HYPERDRIVE_NAME = 'neon-test-hyperdrive'
10+
const HYPERDRIVE_DATABASE = 'neondb'
11+
const HYPERDRIVE_HOST = 'ep-late-cell-a4fm3g5p-pooler.us-east-1.aws.neon.tech'
12+
const HYPERDRIVE_PORT = 5432
13+
const HYPERDRIVE_USER = 'neondb_owner'
14+
const HYPERDRIVE_PASSWORD = 'my-test-password'
15+
16+
eachModel('$modelName', ({ model }) => {
17+
describeEval('Hyperdrive Tool Evaluations', {
18+
data: async () => [
19+
{
20+
input: `Create a new Hyperdrive configuration with the name "${HYPERDRIVE_NAME}" and the database "${HYPERDRIVE_DATABASE}" and the host "${HYPERDRIVE_HOST}" and the port "${HYPERDRIVE_PORT}" and the user "${HYPERDRIVE_USER}" and the password "${HYPERDRIVE_PASSWORD}".`,
21+
expected:
22+
'The hyperdrive_configs_create tool should be called to create a new hyperdrive configuration.',
23+
},
24+
],
25+
task: async (input: string) => {
26+
const client = await initializeClient(/* Pass necessary mocks/config */)
27+
const { promptOutput, toolCalls } = await runTask(client, model, input)
28+
29+
const toolCall = toolCalls.find((call) => call.toolName === 'hyperdrive_config_create')
30+
expect(toolCall, 'Tool hyperdrive_configs_create was not called').toBeDefined()
31+
32+
return promptOutput
33+
},
34+
scorers: [checkFactuality],
35+
threshold: 1,
36+
timeout: 60000, // 60 seconds
37+
})
38+
})
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
eachModel('$modelName', ({ model }) => {
10+
describeEval('Create Cloudflare KV Namespace', {
11+
data: async () => [
12+
{
13+
input: 'Create a new Cloudflare KV Namespace called "my-test-namespace".',
14+
expected: 'The kv_namespaces_create tool should be called to create a new kv namespace.',
15+
},
16+
],
17+
task: async (input: string) => {
18+
const client = await initializeClient(/* Pass necessary mocks/config */)
19+
const { promptOutput, toolCalls } = await runTask(client, model, input)
20+
21+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_create')
22+
expect(toolCall, 'Tool kv_namespace_create was not called').toBeDefined()
23+
24+
return promptOutput
25+
},
26+
scorers: [checkFactuality],
27+
threshold: 1,
28+
timeout: 60000, // 60 seconds
29+
})
30+
describeEval('List Cloudflare KV Namespaces', {
31+
data: async () => [
32+
{
33+
input: 'List all my Cloudflare KV Namespaces.',
34+
expected:
35+
'The kv_namespaces_list tool should be called to retrieve the list of kv namespaces. There should be at least one kv namespace in the list.',
36+
},
37+
],
38+
task: async (input: string) => {
39+
const client = await initializeClient(/* Pass necessary mocks/config */)
40+
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
41+
42+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespaces_list')
43+
expect(toolCall, 'Tool kv_namespaces_list was not called').toBeDefined()
44+
45+
return promptOutput
46+
},
47+
scorers: [checkFactuality],
48+
threshold: 1,
49+
timeout: 60000, // 60 seconds
50+
})
51+
describeEval('Rename Cloudflare KV Namespace', {
52+
data: async () => [
53+
{
54+
input:
55+
'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace".',
56+
expected: 'The kv_namespace_update tool should be called to rename the kv namespace.',
57+
},
58+
],
59+
task: async (input: string) => {
60+
const client = await initializeClient(/* Pass necessary mocks/config */)
61+
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
62+
63+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_update')
64+
expect(toolCall, 'Tool kv_namespace_update was not called').toBeDefined()
65+
66+
return promptOutput
67+
},
68+
scorers: [checkFactuality],
69+
threshold: 1,
70+
timeout: 60000, // 60 seconds
71+
})
72+
describeEval('Get Cloudflare KV Namespace Details', {
73+
data: async () => [
74+
{
75+
input: 'Get details of my Cloudflare KV Namespace called "my-new-test-namespace".',
76+
expected:
77+
'The kv_namespace_get tool should be called to retrieve the details of the kv namespace.',
78+
},
79+
],
80+
task: async (input: string) => {
81+
const client = await initializeClient(/* Pass necessary mocks/config */)
82+
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
83+
84+
console.log('fullResult', JSON.stringify(await fullResult.response, null, 2))
85+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_get')
86+
expect(toolCall, 'Tool kv_namespace_get was not called').toBeDefined()
87+
88+
return promptOutput
89+
},
90+
scorers: [checkFactuality],
91+
threshold: 1,
92+
timeout: 60000, // 60 seconds
93+
})
94+
describeEval('Delete Cloudflare KV Namespace', {
95+
data: async () => [
96+
{
97+
input: 'Look up the id of my only KV namespace and delete it.',
98+
expected: 'The kv_namespace_delete tool should be called to delete the kv namespace.',
99+
},
100+
],
101+
task: async (input: string) => {
102+
const client = await initializeClient(/* Pass necessary mocks/config */)
103+
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
104+
105+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_delete')
106+
expect(toolCall, 'Tool kv_namespace_delete was not called').toBeDefined()
107+
108+
return promptOutput
109+
},
110+
scorers: [checkFactuality],
111+
threshold: 1,
112+
timeout: 60000, // 60 seconds
113+
})
114+
})
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import { MCPClientManager } from 'agents/mcp/client'
2+
import { jsonSchema, streamText, tool } from 'ai'
3+
import { z } from 'zod'
4+
5+
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
6+
7+
export async function initializeClient(): Promise<MCPClientManager> {
8+
const clientManager = new MCPClientManager('test-client', '0.0.0')
9+
await clientManager.connect('http://localhost:8977/sse')
10+
return clientManager
11+
}
12+
13+
export async function runTask(
14+
clientManager: MCPClientManager,
15+
model: LanguageModelV1,
16+
input: string
17+
): Promise<{
18+
promptOutput: string
19+
fullResult: StreamTextResult<ToolSet, never>
20+
toolCalls: ToolCallPart[]
21+
}> {
22+
const tools = clientManager.listTools()
23+
const toolSet: ToolSet = tools.reduce((acc, v) => {
24+
if (!v.inputSchema.properties) {
25+
v.inputSchema.properties = {}
26+
}
27+
28+
acc[v.name] = tool({
29+
parameters: jsonSchema(v.inputSchema as any),
30+
description: v.description,
31+
execute: async (args: any, opts) => {
32+
try {
33+
const res = await clientManager.callTool(
34+
{
35+
...v,
36+
arguments: { ...args },
37+
},
38+
z.any() as any,
39+
{ signal: opts.abortSignal }
40+
)
41+
return res.content
42+
} catch (e) {
43+
console.log('Error calling tool')
44+
console.log(e)
45+
return e
46+
}
47+
},
48+
})
49+
return acc
50+
}, {} as ToolSet)
51+
52+
const res = streamText({
53+
model,
54+
system:
55+
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
56+
tools: toolSet,
57+
prompt: input,
58+
maxRetries: 1,
59+
maxSteps: 10,
60+
})
61+
62+
for await (const part of res.fullStream) {
63+
}
64+
65+
// convert into an LLM readable result so our factuality checker can validate tool calls
66+
let messagesWithTools = ''
67+
const toolCalls: ToolCallPart[] = []
68+
const response = await res.response
69+
const messages = response.messages
70+
71+
for (const message of messages) {
72+
for (const messagePart of message.content) {
73+
if (typeof messagePart === 'string') {
74+
messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
75+
} else if (messagePart.type === 'tool-call') {
76+
messagesWithTools += `<message_content type=${messagePart.type}>
77+
<tool_name>${messagePart.toolName}</tool_name>
78+
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
79+
</message_content>`
80+
toolCalls.push(messagePart)
81+
} else if (messagePart.type === 'text') {
82+
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
83+
}
84+
}
85+
}
86+
87+
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
88+
}

apps/workers-bindings/package.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
"deploy": "wrangler deploy",
99
"deploy:staging": "wrangler deploy --env staging",
1010
"deploy:production": "wrangler deploy --env production",
11+
"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest --testTimeout=60000 --config vitest.config.evals.ts'",
12+
"eval:server": "wrangler dev --var ENVIRONMENT:test --var DEV_DISABLE_OAUTH:true --var DEV_CLOUDFLARE_EMAIL:mcp-server-eval-account@workers-for-platforms-dev.cfdata.org --inspector-port 9230",
13+
"eval:ci": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest run --testTimeout=60000 --config vitest.config.evals.ts'",
1114
"dev": "wrangler dev",
1215
"start": "wrangler dev",
1316
"types": "wrangler types --include-env=false",
@@ -25,10 +28,15 @@
2528
"@cloudflare/workers-oauth-provider": "0.0.5",
2629
"@modelcontextprotocol/sdk": "1.10.2",
2730
"@n8n/json-schema-to-zod": "1.1.0",
31+
"@repo/eval-tools": "workspace:*",
2832
"@repo/mcp-common": "workspace:*",
2933
"@repo/mcp-observability": "workspace:*",
3034
"agents": "0.0.67",
35+
"ai": "4.3.10",
36+
"concurrently": "9.1.2",
3137
"hono": "4.7.6",
38+
"start-server-and-test": "2.0.11",
39+
"vitest-evals": "0.1.4",
3240
"zod": "3.24.2"
3341
}
3442
}

0 commit comments

Comments
 (0)