Skip to content

Commit 4717f26

Browse files
authored
Merge pull request #65 from cloudflare/csparks/evals-improvements
Add file_write eval and refine evals to include the correct tool call metadata
2 parents 7383b5c + c338d3d commit 4717f26

File tree

13 files changed

+416
-54
lines changed

13 files changed

+416
-54
lines changed

.github/workflows/evals.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ on:
33
push:
44

55
jobs:
6-
test:
6+
eval:
77
runs-on: ubuntu-24.04
88
strategy:
99
matrix:
@@ -22,7 +22,10 @@ jobs:
2222
- name: Create .dev.vars file
2323
run: |
2424
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/sandbox-container/.dev.vars
25+
- name: Verify .dev.vars file
26+
run: |
27+
du -h ./apps/sandbox-container/.dev.vars
2528
- name: Install dependencies
2629
run: pnpm install
2730
- name: Run evals
28-
run: pnpm eval
31+
run: pnpm eval:ci

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,5 @@ yarn-error.log*
4848
.sentryclirc.lock/
4949
tmp.json
5050
tmp.ts
51+
52+
apps/sandbox-container/workdir
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import { assert, expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
import { z } from 'zod'
4+
5+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
6+
import { eachModel } from '@repo/eval-tools/src/test-models'
7+
8+
import { initializeClient, runTask } from './utils'
9+
10+
/**
 * Eval: asks each configured model to write a file through the container MCP
 * server, then reads the file back with the `container_file_read` tool and
 * checks its contents before handing the model's answer to the scorer.
 */
eachModel('$modelName', ({ model }) => {
  describeEval('Runs container file write', {
    data: async () => [
      {
        input: 'write a file named test.txt containing the text "asdf"',
        expected: 'The container_file_write tool was called and the file\'s content is "asdf"',
      },
    ],
    task: async (input) => {
      const client = await initializeClient()
      const promptOutput = await runTask(client, model, input)

      // `find` takes a boolean predicate: compare the name directly rather than
      // returning the tool object (or an implicit `undefined`) from the callback.
      const fileRead = client.listTools().find((tool) => tool.name === 'container_file_read')
      assert(fileRead !== undefined, 'container_file_read tool was not listed by the MCP client')

      // Read the file back through the MCP server so the eval verifies the
      // write really happened, independently of what the model claims.
      const result = await client.callTool(
        {
          ...fileRead,
          arguments: {
            args: { path: 'file://test.txt' },
          },
        },
        // Result schema is deliberately left open; the shape is checked below.
        z.any() as any,
        {}
      )

      expect(result.content).toStrictEqual([
        {
          type: 'resource',
          resource: {
            uri: 'file://test.txt',
            mimeType: 'text/plain',
            text: 'asdf',
          },
        },
      ])

      // The model's own output is what the factuality scorer grades.
      return promptOutput
    },
    scorers: [checkFactuality],
    threshold: 1,
  })
})

apps/sandbox-container/evals/initialize.eval.ts

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
import { MCPClientManager } from 'agents/mcp/client'
2-
import { generateText, tool, ToolExecutionOptions, ToolSet } from 'ai'
31
import { describeEval } from 'vitest-evals'
42

53
import { checkFactuality } from '@repo/eval-tools/src/scorers'
64
import { eachModel } from '@repo/eval-tools/src/test-models'
75

8-
import { runTask } from './utils'
6+
import { initializeClient, runTask } from './utils'
97

108
eachModel('$modelName', ({ model }) => {
119
describeEval('Runs container initialize', {
@@ -17,7 +15,8 @@ eachModel('$modelName', ({ model }) => {
1715
},
1816
],
1917
task: async (input) => {
20-
return await runTask(model, input)
18+
const client = await initializeClient()
19+
return await runTask(client, model, input)
2120
},
2221
scorers: [checkFactuality],
2322
threshold: 1,

apps/sandbox-container/evals/utils.ts

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,42 @@
11
import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
22
import { MCPClientManager } from 'agents/mcp/client'
33
import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
4+
import { z } from 'zod'
45

56
import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
67

7-
export async function runTask(model: LanguageModelV1, input: string) {
8+
/**
 * Creates an MCP client manager and connects it to an SSE endpoint.
 *
 * @param serverUrl SSE endpoint to connect to. Defaults to the address the
 *   eval scripts use, so existing zero-argument callers are unaffected.
 * @returns the client manager once `connect` has resolved
 */
export async function initializeClient(
  serverUrl = 'http://localhost:8976/sse'
): Promise<MCPClientManager> {
  const clientManager = new MCPClientManager('test-client', '0.0.0')
  await clientManager.connect(serverUrl)
  return clientManager
}
1013

14+
export async function runTask(
15+
clientManager: MCPClientManager,
16+
model: LanguageModelV1,
17+
input: string
18+
) {
1119
const tools = clientManager.listTools()
1220
const toolSet: ToolSet = tools.reduce((acc, v) => {
1321
acc[v.name] = tool({
1422
parameters: jsonSchemaToZod(v.inputSchema as JsonSchemaObject),
1523
description: v.description,
1624
execute: async (args, opts) => {
17-
const res = await clientManager.callTool(v, args, { signal: opts.abortSignal })
18-
console.log(res.toolResult)
19-
return res.content
25+
try {
26+
const res = await clientManager.callTool(
27+
{
28+
...v,
29+
arguments: { ...args },
30+
},
31+
z.any() as any,
32+
{ signal: opts.abortSignal }
33+
)
34+
return res.content
35+
} catch (e) {
36+
console.log('Error calling tool')
37+
console.log(e)
38+
return e
39+
}
2040
},
2141
})
2242
return acc

apps/sandbox-container/package.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
"postinstall": "mkdir -p workdir",
1414
"test": "vitest",
1515
"types": "wrangler types",
16-
"eval:dev": "concurrently \"npm run dev\" \"vitest --config vitest.config.evals.ts\"",
17-
"eval": "concurrently \"npm run dev\" \"vitest run --config vitest.config.evals.ts\""
16+
"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8976 'vitest --config vitest.config.evals.ts'",
17+
"eval:server": "concurrently \"tsx container/index.ts\" \"wrangler dev --var \"ENVIRONMENT:test\"\"",
18+
"eval:ci": "start-server-and-test --expect 404 eval:server http://localhost:8976 'vitest run --config vitest.config.evals.ts'"
1819
},
1920
"dependencies": {
2021
"@cloudflare/workers-oauth-provider": "0.0.2",
@@ -40,11 +41,12 @@
4041
"zod": "^3.24.2"
4142
},
4243
"devDependencies": {
43-
"@types/mock-fs": "^4.13.4",
44-
"mock-fs": "^5.5.0",
4544
"@cloudflare/vitest-pool-workers": "0.8.14",
45+
"@types/mock-fs": "^4.13.4",
4646
"ai": "^4.3.6",
4747
"concurrently": "^9.1.2",
48+
"mock-fs": "^5.5.0",
49+
"start-server-and-test": "^2.0.11",
4850
"wrangler": "^4.9.1"
4951
}
5052
}

apps/sandbox-container/server/containerHelpers.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
export const MAX_CONTAINERS = 8
22
export async function startAndWaitForPort(
3-
environment: 'dev' | 'prod',
3+
environment: 'dev' | 'prod' | 'test',
44
container: Container | undefined,
55
portToAwait: number,
66
maxTries = 10
77
): Promise<boolean> {
8-
if (environment === 'dev') {
8+
if (environment === 'dev' || environment === 'test') {
99
console.log('Running in dev, assuming locally running container')
1010
return true
1111
}
@@ -62,12 +62,12 @@ export async function startAndWaitForPort(
6262
}
6363

6464
export async function proxyFetch(
65-
environment: 'dev' | 'prod',
65+
environment: 'dev' | 'prod' | 'test',
6666
container: Container | undefined,
6767
request: Request,
6868
portNumber: number
6969
): Promise<Response> {
70-
if (environment === 'dev') {
70+
if (environment === 'dev' || environment === 'test') {
7171
const url = request.url
7272
.replace('https://', 'http://')
7373
.replace('http://host', 'http://localhost')

apps/sandbox-container/server/index.ts

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import OAuthProvider from '@cloudflare/workers-oauth-provider'
2+
import { McpAgent } from 'agents/mcp'
23
import { env } from 'cloudflare:workers'
34

45
import {
@@ -16,7 +17,7 @@ export { ContainerManager, ContainerMcpAgent }
1617
export type Env = {
1718
CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
1819
CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
19-
ENVIRONMENT: 'dev' | 'prod'
20+
ENVIRONMENT: 'dev' | 'prod' | 'test'
2021
CLOUDFLARE_CLIENT_ID: string
2122
CLOUDFLARE_CLIENT_SECRET: string
2223
}
@@ -38,17 +39,36 @@ const ContainerScopes = {
3839
offline_access: 'Grants refresh tokens for long-lived access.',
3940
} as const
4041

41-
export default new OAuthProvider({
42-
apiRoute: '/sse',
43-
// @ts-ignore
44-
apiHandler: ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }),
45-
// @ts-ignore
46-
defaultHandler: createAuthHandlers({ scopes: ContainerScopes }),
47-
authorizeEndpoint: '/oauth/authorize',
48-
tokenEndpoint: '/token',
49-
tokenExchangeCallback: (options) =>
50-
handleTokenExchangeCallback(options, env.CLOUDFLARE_CLIENT_ID, env.CLOUDFLARE_CLIENT_SECRET),
51-
// Cloudflare access token TTL
52-
accessTokenTTL: 3600,
53-
clientRegistrationEndpoint: '/register',
54-
})
42+
/**
 * Worker entry point.
 *
 * When `ENVIRONMENT` is `'test'`, requests skip the OAuth provider and are
 * passed directly to the MCP agent mounted at `/sse`. In every other
 * environment the request is handled by an `OAuthProvider` wrapping the same
 * agent.
 */
export default {
  fetch(req: Request, env: Env, ctx: ExecutionContext) {
    if (env.ENVIRONMENT !== 'test') {
      const provider = new OAuthProvider({
        apiRoute: '/sse',
        // @ts-ignore
        apiHandler: ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }),
        // @ts-ignore
        defaultHandler: createAuthHandlers({ scopes: ContainerScopes }),
        authorizeEndpoint: '/oauth/authorize',
        tokenEndpoint: '/token',
        tokenExchangeCallback: (options) =>
          handleTokenExchangeCallback(
            options,
            env.CLOUDFLARE_CLIENT_ID,
            env.CLOUDFLARE_CLIENT_SECRET
          ),
        // Cloudflare access token TTL
        accessTokenTTL: 3600,
        clientRegistrationEndpoint: '/register',
      })
      return provider.fetch(req, env, ctx)
    }

    // Test mode: no OAuth flow runs, so props are set to an empty object here.
    // NOTE(review): presumably the agent expects `ctx.props` to exist — confirm.
    ctx.props = {}
    const agentHandler = ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' })
    return agentHandler.fetch(
      req,
      env as Record<string, DurableObjectNamespace<McpAgent> | any>,
      ctx
    )
  },
}

apps/sandbox-container/vitest.config.evals.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ export default defineWorkersConfig({
77
workers: {
88
isolatedStorage: true,
99
wrangler: { configPath: './wrangler.jsonc' },
10+
miniflare: {
11+
bindings: {
12+
ENVIRONMENT: 'test',
13+
},
14+
},
1015
},
1116
},
1217
},

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"test": "vitest run",
2626
"fix:format": "prettier . --write",
2727
"test:watch": "vitest",
28-
"eval": "run-turbo eval"
28+
"eval:ci": "run-turbo eval:ci"
2929
},
3030
"devDependencies": {
3131
"@changesets/cli": "2.28.1",

0 commit comments

Comments
 (0)