Skip to content

Commit 0c5b6c5

Browse files
committed
feat: add hyperdrive bindings and evals
1 parent d0866b1 commit 0c5b6c5

File tree

23 files changed

+762
-68
lines changed

23 files changed

+762
-68
lines changed

.github/workflows/evals.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@ jobs:
2222
- name: Create .dev.vars file
2323
run: |
2424
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/sandbox-container/.dev.vars
25+
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/workers-bindings/.dev.vars
2526
- name: Verify .dev.vars file
2627
run: |
2728
du -h ./apps/sandbox-container/.dev.vars
29+
du -h ./apps/workers-bindings/.dev.vars
2830
- name: Install dependencies
2931
run: pnpm install
3032
- name: Run evals

apps/cloudflare-one-casb/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"@hono/zod-validator": "0.4.3",
1717
"@modelcontextprotocol/sdk": "1.10.2",
1818
"@repo/mcp-common": "workspace:*",
19-
"agents": "0.0.67",
19+
"agents": "0.0.75",
2020
"cloudflare": "4.2.0",
2121
"hono": "4.7.6",
2222
"zod": "3.24.2"

apps/demo-day/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"@repo/mcp-common": "workspace:*",
1616
"@repo/mcp-observability": "workspace:*",
1717
"@types/node": "22.14.1",
18-
"agents": "0.0.67",
18+
"agents": "0.0.75",
1919
"zod": "3.24.2"
2020
},
2121
"devDependencies": {

apps/docs-autorag/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"@modelcontextprotocol/sdk": "1.10.2",
1818
"@repo/mcp-common": "workspace:*",
1919
"@repo/mcp-observability": "workspace:*",
20-
"agents": "0.0.67",
20+
"agents": "0.0.75",
2121
"cloudflare": "4.2.0",
2222
"hono": "4.7.6",
2323
"mime": "4.0.6",

apps/radar/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"@modelcontextprotocol/sdk": "1.10.2",
1818
"@repo/mcp-common": "workspace:*",
1919
"@repo/mcp-observability": "workspace:*",
20-
"agents": "0.0.67",
20+
"agents": "0.0.75",
2121
"cloudflare": "4.2.0",
2222
"hono": "4.7.6",
2323
"zod": "3.24.2"

apps/sandbox-container/package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
"@repo/mcp-common": "workspace:*",
2828
"@repo/mcp-observability": "workspace:*",
2929
"@types/node": "22.14.1",
30-
"agents": "0.0.67",
30+
"agents": "0.0.75",
3131
"cron-schedule": "5.0.4",
3232
"esbuild": "0.25.1",
3333
"hono": "4.7.6",
@@ -40,7 +40,7 @@
4040
"devDependencies": {
4141
"@cloudflare/vitest-pool-workers": "0.8.14",
4242
"@types/mock-fs": "4.13.4",
43-
"ai": "4.3.6",
43+
"ai": "4.3.10",
4444
"concurrently": "9.1.2",
4545
"mock-fs": "5.5.0",
4646
"start-server-and-test": "2.0.11",

apps/sandbox-container/server/index.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,4 @@ export default {
7878
clientRegistrationEndpoint: '/register',
7979
}).fetch(req, env, ctx)
8080
},
81-
} /*
82-
83-
*/
81+
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
/** Placeholder account ID that is embedded in the prompt and expected back in the tool-call arguments. */
const MOCK_ACCOUNT_ID = 'mock-account-12345'

/**
 * Account tool evals, registered once for every model supplied by `eachModel`.
 *
 * For each data case the task sends the prompt through `runTask`, asserts that
 * the tool call matching the prompt was recorded, and returns the text output
 * so the configured scorer (`checkFactuality`) can evaluate it.
 */
eachModel('$modelName', ({ model }) => {
  describeEval('Account Tool Evaluations', {
    data: async () => [
      {
        input: 'List all my Cloudflare accounts.',
        expected: 'The accounts_list tool should be called to retrieve the list of accounts.',
      },
      {
        input: `Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}.`,
        expected: `The set_active_account tool should be called with the account ID ${MOCK_ACCOUNT_ID}.`,
      },
    ],
    task: async (input: string) => {
      const client = await initializeClient(/* Pass necessary mocks/config */)
      // runTask also returns `fullResult`; it is not needed for these assertions.
      const { promptOutput, toolCalls } = await runTask(client, model, input)

      // The prompt text selects which tool-call assertion applies to this case.
      if (input.includes('List all my Cloudflare accounts')) {
        const toolCall = toolCalls.find((call) => call.toolName === 'accounts_list')
        expect(toolCall, 'Tool accounts_list was not called').toBeDefined()
      } else if (input.includes(`Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}`)) {
        const toolCall = toolCalls.find((call) => call.toolName === 'set_active_account')
        expect(toolCall, 'Tool set_active_account was not called').toBeDefined()

        // Only the account ID argument is pinned; any additional arguments are tolerated.
        expect(toolCall?.args, 'Arguments for set_active_account did not match').toEqual(
          expect.objectContaining({ activeAccountIdParam: MOCK_ACCOUNT_ID })
        )
      }

      return promptOutput
    },
    scorers: [checkFactuality],
    threshold: 1,
    timeout: 60000, // 60 seconds
  })
})
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
// Connection details interpolated into the eval prompt. These must stay
// placeholders: the prompt (including the password) is sent to the model.
// NOTE(review): the host looks like a concrete database endpoint — confirm it is safe to publish.
const HYPERDRIVE_NAME = 'neon-test-hyperdrive'
const HYPERDRIVE_DATABASE = 'neondb'
const HYPERDRIVE_HOST = 'ep-late-cell-a4fm3g5p-pooler.us-east-1.aws.neon.tech'
const HYPERDRIVE_PORT = 5432
const HYPERDRIVE_USER = 'neondb_owner'
const HYPERDRIVE_PASSWORD = 'my-test-password'

/**
 * Hyperdrive tool evals, registered once for every model supplied by `eachModel`.
 *
 * The task sends the prompt through `runTask`, asserts that the Hyperdrive
 * create tool was called, and returns the text output so the configured
 * scorer (`checkFactuality`) can evaluate it.
 *
 * NOTE(review): the tool name used below (`hyperdrive_config_create`) is the one
 * the assertion looks for; the description and failure message are kept in sync
 * with it. Confirm the spelling against the tool registration.
 */
eachModel('$modelName', ({ model }) => {
  describeEval('Hyperdrive Tool Evaluations', {
    data: async () => [
      {
        input: `Create a new Hyperdrive configuration with the name "${HYPERDRIVE_NAME}" and the database "${HYPERDRIVE_DATABASE}" and the host "${HYPERDRIVE_HOST}" and the port "${HYPERDRIVE_PORT}" and the user "${HYPERDRIVE_USER}" and the password "${HYPERDRIVE_PASSWORD}".`,
        expected:
          'The hyperdrive_config_create tool should be called to create a new hyperdrive configuration.',
      },
    ],
    task: async (input: string) => {
      const client = await initializeClient(/* Pass necessary mocks/config */)
      // runTask also returns `fullResult`; it is not needed for this assertion.
      const { promptOutput, toolCalls } = await runTask(client, model, input)

      if (input.includes('Create a new Hyperdrive configuration')) {
        const toolCall = toolCalls.find((call) => call.toolName === 'hyperdrive_config_create')
        expect(toolCall, 'Tool hyperdrive_config_create was not called').toBeDefined()
      }

      return promptOutput
    },
    scorers: [checkFactuality],
    threshold: 1,
    timeout: 60000, // 60 seconds
  })
})
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
/**
 * KV namespace tool evals, registered once for every model supplied by `eachModel`.
 *
 * The task sends each prompt through `runTask` and returns the text output so
 * the configured scorer (`checkFactuality`) can evaluate it. Only the "list"
 * prompt additionally asserts that a specific tool call was recorded.
 *
 * NOTE(review): the tool names in `expected` follow the `kv_namespace_<verb>`
 * pattern, with the plural form used only for the list tool — confirm against
 * the registered tools.
 */
eachModel('$modelName', ({ model }) => {
  describeEval('KV Namespaces Tool Evaluations', {
    data: async () => [
      {
        input: 'Create a new Cloudflare KV Namespace called "my-test-namespace".',
        expected: 'The kv_namespace_create tool should be called to create a new kv namespace.',
      },
      {
        input: 'List all my Cloudflare KV Namespaces.',
        expected:
          'The kv_namespaces_list tool should be called to retrieve the list of kv namespaces. There should be at least one kv namespace in the list.',
      },
      {
        input:
          'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace".',
        expected: 'The kv_namespace_update tool should be called to rename the kv namespace.',
      },
      {
        input: 'Get details of my Cloudflare KV Namespace called "my-new-test-namespace".',
        expected:
          'The kv_namespace_get tool should be called to retrieve the details of the kv namespace.',
      },
      {
        input: 'Look up the id of my only KV namespace and delete it.',
        expected: 'The kv_namespace_delete tool should be called to delete the kv namespace.',
      },
    ],
    task: async (input: string) => {
      const client = await initializeClient(/* Pass necessary mocks/config */)
      // runTask also returns `fullResult`; it is not needed for this assertion.
      const { promptOutput, toolCalls } = await runTask(client, model, input)

      if (input.includes('List all my Cloudflare KV Namespaces')) {
        const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespaces_list')
        expect(toolCall, 'Tool kv_namespaces_list was not called').toBeDefined()
      }

      return promptOutput
    },
    scorers: [checkFactuality],
    threshold: 1,
    timeout: 60000, // 60 seconds
  })
})

0 commit comments

Comments
 (0)