Skip to content

Commit f62afcc

Browse files
committed
feat: add hyperdrive bindings and evals
1 parent 895162f commit f62afcc

File tree

10 files changed

+624
-19
lines changed

10 files changed

+624
-19
lines changed
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import { expect, test } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
// Eval helpers: connect to the MCP server and run a prompt against a model
8+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
9+
10+
// Mock account ID referenced by the "set active account" test case.
const MOCK_ACCOUNT_ID = 'mock-account-12345'

/**
 * Account tool evals, run through `eachModel` with the model it supplies.
 *
 * Each case sends its prompt through `runTask`, asserts that the expected
 * tool call was made, and returns the rendered transcript so that
 * `checkFactuality` can score it against the case's `expected` description.
 */
eachModel('$modelName', ({ model }) => {
  describeEval('Account Tool Evaluations', {
    // Test cases for account tools
    data: async () => [
      {
        input: 'List all my Cloudflare accounts.',
        expected: 'The accounts_list tool should be called to retrieve the list of accounts.',
      },
      {
        input: `Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}.`,
        expected: `The set_active_account tool should be called with the account ID ${MOCK_ACCOUNT_ID}.`,
      },
      // TODO: Add more test cases, e.g., edge cases, invalid inputs?
    ],
    // The core task execution logic, accepting only input
    task: async (input: string) => {
      // Initialize the testing client/environment
      const client = await initializeClient()

      // The result shape is inferred from `runTask`; no separate result type
      // is imported into this file.
      const { promptOutput, toolCalls } = await runTask(client, model, input)

      // Assertions based on the input
      if (input.includes('List all my Cloudflare accounts')) {
        const toolCall = toolCalls.find((call) => call.toolName === 'accounts_list')
        expect(toolCall, 'Tool accounts_list was not called').toBeDefined()
      } else if (input.includes(`Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}`)) {
        const toolCall = toolCalls.find((call) => call.toolName === 'set_active_account')
        expect(toolCall, 'Tool set_active_account was not called').toBeDefined()

        // Check arguments passed to set_active_account
        expect(toolCall?.args, 'Arguments for set_active_account did not match').toEqual(
          expect.objectContaining({ activeAccountIdParam: MOCK_ACCOUNT_ID })
        )

        // NOTE(review): the agent's stored active account is not verified here.
        // `runTask` returns only the transcript, the stream result and the tool
        // calls, so there is no agent instance in scope to query.
      }

      // Return the model's final output for scoring
      return promptOutput
    },
    // Scoring functions to evaluate the outcome against the 'expected' description
    scorers: [checkFactuality],
    // Passing threshold (1 = perfect score required)
    threshold: 1,
    // Timeout per test case
    timeout: 60000, // 60 seconds
  })
})
File renamed without changes.
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
2+
import { MCPClientManager } from 'agents/mcp/client'
3+
import { streamText, tool } from 'ai'
4+
import { z } from 'zod'
5+
6+
import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
7+
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
8+
9+
/**
 * Creates the MCP client manager used by the evals and connects it to the
 * SSE endpoint on localhost.
 *
 * @returns the client manager, after `connect` has resolved
 */
export async function initializeClient(): Promise<MCPClientManager> {
  const manager = new MCPClientManager('test-client', '0.0.0')
  await manager.connect('http://localhost:8976/sse')
  return manager
}
14+
15+
/**
 * Sends `input` to `model` with every tool listed by `clientManager` made
 * available, then renders the response messages into a tagged transcript.
 *
 * @param clientManager - MCP client manager whose tools are offered to the model
 * @param model - language model to run the prompt against
 * @param input - the user prompt
 * @returns `promptOutput` (the transcript of text and tool-call parts),
 *   `fullResult` (the `streamText` result) and `toolCalls` (every tool-call
 *   part found in the response messages, in order)
 */
export async function runTask(
  clientManager: MCPClientManager,
  model: LanguageModelV1,
  input: string
): Promise<{
  promptOutput: string
  fullResult: StreamTextResult<ToolSet, never>
  toolCalls: ToolCallPart[]
}> {
  // Wrap each MCP tool so the model can call it through the client manager.
  const tools = clientManager.listTools()
  const toolSet: ToolSet = tools.reduce((acc, v) => {
    acc[v.name] = tool({
      parameters: jsonSchemaToZod(v.inputSchema as JsonSchemaObject),
      description: v.description,
      execute: async (args, opts) => {
        try {
          const res = await clientManager.callTool(
            {
              ...v,
              arguments: { ...args },
            },
            z.any() as any,
            { signal: opts.abortSignal }
          )
          return res.content
        } catch (e) {
          // A failed tool call is logged and handed back as the tool result
          // instead of being rethrown.
          console.log('Error calling tool')
          console.log(e)
          return e
        }
      },
    })
    return acc
  }, {} as ToolSet)

  const res = streamText({
    model,
    system:
      "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
    tools: toolSet,
    prompt: input,
    maxRetries: 1,
    maxSteps: 10,
  })

  // Drain the stream; the individual parts are not needed here.
  for await (const _part of res.fullStream) {
    // intentionally empty
  }

  // convert into an LLM readable result so our factuality checker can validate tool calls
  let messagesWithTools = ''
  const toolCalls: ToolCallPart[] = []
  const messages = (await res.response).messages
  for (const message of messages) {
    console.log(message.content)

    // Plain-string content is a single text message. Iterating it with
    // `for...of` would yield one character at a time.
    if (typeof message.content === 'string') {
      messagesWithTools += `<message_content type="text">${message.content}</message_content>`
      continue
    }

    for (const messagePart of message.content) {
      if (messagePart.type === 'tool-call') {
        messagesWithTools += `<message_content type=${messagePart.type}>
<tool_name>${messagePart.toolName}</tool_name>
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
</message_content>`
        toolCalls.push(messagePart)
      } else if (messagePart.type === 'text') {
        messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
      }
    }
  }

  return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
}

apps/workers-bindings/package.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
"deploy": "wrangler deploy",
99
"deploy:staging": "wrangler deploy --env staging",
1010
"deploy:production": "wrangler deploy --env production",
11+
"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8976 'vitest --testTimeout=60000 --config vitest.config.evals.ts'",
12+
"eval:server": "concurrently \"tsx container/index.ts\" \"wrangler dev --var \"ENVIRONMENT:test\"\"",
13+
"eval:ci": "start-server-and-test --expect 404 eval:server http://localhost:8976 'vitest run --testTimeout=60000 --config vitest.config.evals.ts'",
1114
"dev": "wrangler dev",
1215
"start": "wrangler dev",
1316
"types": "wrangler types --include-env=false",
@@ -25,10 +28,15 @@
2528
"@cloudflare/workers-oauth-provider": "0.0.3",
2629
"@modelcontextprotocol/sdk": "1.10.2",
2730
"@n8n/json-schema-to-zod": "1.1.0",
31+
"@repo/eval-tools": "workspace:*",
2832
"@repo/mcp-common": "workspace:*",
2933
"@repo/mcp-observability": "workspace:*",
3034
"agents": "0.0.67",
35+
"ai": "4.3.6",
36+
"concurrently": "9.1.2",
3137
"hono": "4.7.6",
38+
"start-server-and-test": "2.0.11",
39+
"vitest-evals": "0.1.4",
3240
"zod": "3.24.2"
3341
}
3442
}

apps/workers-bindings/test/index.test.ts

Lines changed: 0 additions & 7 deletions
This file was deleted.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{
22
"extends": "@repo/typescript-config/workers.json",
3-
"include": ["*/**.ts"]
3+
"include": ["*/**.ts", "./vitest.config.evals.ts"]
44
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import { defineWorkersConfig } from '@cloudflare/vitest-pool-workers/config'
2+
3+
// Workers pool settings for eval runs: isolated storage, the local wrangler
// config, and an ENVIRONMENT binding set to 'test'.
const workersPoolOptions = {
  isolatedStorage: true,
  wrangler: { configPath: './wrangler.jsonc' },
  miniflare: {
    bindings: {
      ENVIRONMENT: 'test',
    },
  },
}

// Vitest config for eval suites: only files matching `*.eval.*` are included.
export default defineWorkersConfig({
  test: {
    include: ['**/*.eval.?(c|m)[jt]s?(x)'],
    poolOptions: {
      workers: workersPoolOptions,
    },
  },
})

0 commit comments

Comments
 (0)