Skip to content

Commit 021b628

Browse files
Merge branch 'main' into wp-1582-courtney
2 parents da1ae7e + 7d6a3e4 commit 021b628

32 files changed

+11116
-5582
lines changed

.github/workflows/evals.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Runs the eval suite (`pnpm eval`) on every push.
name: Evals
on:
  push:

jobs:
  test:
    runs-on: ubuntu-24.04
    strategy:
      matrix:
        node-version: [22]
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        uses: pnpm/action-setup@v4
        with:
          version: 10.8.0
      - name: Use Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'pnpm'
      - name: Create .dev.vars file
        # Hand the secret to the shell through the environment instead of
        # expanding it into the script text, so its value is never parsed as
        # shell syntax.
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          echo "OPENAI_API_KEY=${OPENAI_API_KEY}" > ./apps/sandbox-container/.dev.vars
      - name: Install dependencies
        run: pnpm install
      - name: Run evals
        run: pnpm eval

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Replace the content with the following configuration. Once you restart Claude De
2020
"command": "npx",
2121
"args": [
2222
"mcp-remote",
23-
"https://mcp.cloudflare.com/workers/observability/sse"
23+
"https://observability.mcp.cloudflare.com/sse"
2424
]
2525
}
2626
}

apps/sandbox-container/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Do the following from within the sandbox-container app:
1212
2. Get the Cloudflare client id and secret from a team member and add them to the `.dev.vars` file.
1313
3. Run `pnpm i` then `pnpm dev` to start the MCP server.
1414
4. Run `pnpx @modelcontextprotocol/inspector` to start the MCP inspector client.
15-
5. Open the inspector client in your browser and connect to the server via `http://localhost:8976/workers/sandbox/sse`.
15+
5. Open the inspector client in your browser and connect to the server via `http://localhost:8976/sse`.
1616

1717
Note: Temporary files created through files tool calls are stored in the workdir folder of this app.
1818

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Module augmentation for 'cloudflare:test': `ProvidedEnv` is declared to
// extend `Env`, so it carries every member that `Env` declares.
declare module 'cloudflare:test' {
	// Intentionally empty body: nothing is added beyond what `Env` declares.
	interface ProvidedEnv extends Env {}
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import { MCPClientManager } from 'agents/mcp/client'
2+
import { generateText, tool, ToolExecutionOptions, ToolSet } from 'ai'
3+
import { describeEval } from 'vitest-evals'
4+
5+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
6+
import { eachModel } from '@repo/eval-tools/src/test-models'
7+
8+
import { runTask } from './utils'
9+
10+
// Registers the "Runs container initialize" eval inside the callback that
// `eachModel` invokes with a `model`.
eachModel('$modelName', ({ model }) => {
	// Builds a fresh list of eval cases on every call.
	async function loadCases() {
		return [
			{
				input: 'create and ping a container',
				expected:
					'The container_initialize tool was called and then the container_ping tool was called',
			},
		]
	}

	describeEval('Runs container initialize', {
		data: loadCases,
		// The transcript returned by runTask is what the scorer checks against `expected`.
		task: (input) => runTask(model, input),
		scorers: [checkFactuality],
		threshold: 1,
	})
})
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
2+
import { MCPClientManager } from 'agents/mcp/client'
3+
import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
4+
5+
import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
6+
7+
/**
 * Runs one prompt against the MCP server at `http://localhost:8787/sse` and
 * returns a transcript of the model's text output and tool calls.
 *
 * Every tool listed by the MCP client is wrapped as an `ai` tool, the prompt is
 * streamed through `streamText`, and the resulting response messages are
 * flattened into `<message_content>` elements so that a downstream scorer can
 * see which tools were called and with which arguments.
 *
 * @param model - language model passed to `streamText`
 * @param input - user prompt for this run
 * @returns the concatenated `<message_content>` elements for all response messages
 */
export async function runTask(model: LanguageModelV1, input: string): Promise<string> {
	const clientManager = new MCPClientManager('test-client', '0.0.0')
	// NOTE(review): the app README tells users to connect to port 8976; confirm
	// the dev server started for evals really listens on 8787.
	await clientManager.connect('http://localhost:8787/sse')
	// NOTE(review): the connection is never closed here — confirm whether
	// MCPClientManager offers a close/cleanup call that should run after the task.

	const toolSet: ToolSet = {}
	for (const mcpTool of clientManager.listTools()) {
		toolSet[mcpTool.name] = tool({
			parameters: jsonSchemaToZod(mcpTool.inputSchema as JsonSchemaObject),
			description: mcpTool.description,
			execute: async (args, opts) => {
				// NOTE(review): `args` is passed as the second positional argument and is
				// not merged into the first (e.g. as `arguments`). Verify this matches
				// the `callTool` signature of the installed `agents` version.
				const res = await clientManager.callTool(mcpTool, args, { signal: opts.abortSignal })
				console.log(res.toolResult)
				return res.content
			},
		})
	}

	const res = streamText({
		model,
		system:
			"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
		tools: toolSet,
		prompt: input,
		maxRetries: 1,
		maxSteps: 10,
	})

	// Consume the whole stream; the parts themselves are not needed because the
	// final messages are read from `res.response` below.
	for await (const _part of res.fullStream) {
		// intentionally empty
	}

	// convert into an LLM readable result so our factuality checker can validate tool calls
	let messagesWithTools = ''
	const messages = (await res.response).messages
	for (const message of messages) {
		console.log(message.content)

		// Plain-string content must be emitted as one element. Iterating a string
		// with for...of would yield one element per character.
		if (typeof message.content === 'string') {
			messagesWithTools += `<message_content type="text">${message.content}</message_content>`
			continue
		}

		for (const messagePart of message.content) {
			if (messagePart.type === 'tool-call') {
				messagesWithTools += [
					`<message_content type="${messagePart.type}">`,
					`<tool_name>${messagePart.toolName}</tool_name>`,
					`<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>`,
					`</message_content>`,
				].join('\n')
			} else if (messagePart.type === 'text') {
				messagesWithTools += `<message_content type="${messagePart.type}">${messagePart.text}</message_content>`
			}
			// Other part types (e.g. tool results) are deliberately left out of the transcript.
		}
	}

	return messagesWithTools
}

apps/sandbox-container/package.json

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,38 @@
1212
"start:container": "tsx container/index.ts",
1313
"postinstall": "mkdir -p workdir",
1414
"test": "vitest",
15-
"types": "wrangler types"
15+
"types": "wrangler types",
16+
"eval:dev": "concurrently \"npm run dev\" \"vitest --config vitest.config.evals.ts\"",
17+
"eval": "concurrently \"npm run dev\" \"vitest run --config vitest.config.evals.ts\""
1618
},
1719
"dependencies": {
1820
"@cloudflare/workers-oauth-provider": "0.0.2",
1921
"@cloudflare/workers-types": "^4.20250320.0",
2022
"@hono/node-server": "^1.13.8",
2123
"@hono/zod-validator": "^0.4.3",
22-
"@modelcontextprotocol/sdk": "^1.7.0",
24+
"@modelcontextprotocol/sdk": "^1.9.0",
25+
"@n8n/json-schema-to-zod": "^1.1.0",
26+
"@repo/eval-tools": "workspace:*",
2327
"@repo/mcp-common": "workspace:*",
2428
"@types/node": "^22.13.10",
25-
"agents": "^0.0.42",
29+
"agents": "^0.0.60",
2630
"cron-schedule": "^5.0.4",
2731
"esbuild": "^0.25.1",
2832
"hono": "^4.7.5",
2933
"mime": "^4.0.6",
3034
"octokit": "^4.1.2",
3135
"partyserver": "^0.0.65",
36+
"simple-git-hooks": "^2.12.1",
3237
"tsx": "^4.19.3",
38+
"vitest-evals": "^0.1.4",
3339
"workers-mcp": "0.1.0-3",
3440
"zod": "^3.24.2"
3541
},
3642
"devDependencies": {
3743
"@types/mock-fs": "^4.13.4",
3844
"mock-fs": "^5.5.0",
45+
"@cloudflare/vitest-pool-workers": "0.8.14",
46+
"ai": "^4.3.6",
3947
"concurrently": "^9.1.2",
4048
"wrangler": "^4.9.1"
4149
}

apps/sandbox-container/server/index.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ export type Env = {
1717
CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
1818
CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
1919
ENVIRONMENT: 'dev' | 'prod'
20+
CLOUDFLARE_CLIENT_ID: string
21+
CLOUDFLARE_CLIENT_SECRET: string
2022
}
2123

2224
// Context from the auth process, encrypted & stored in the auth token
@@ -28,9 +30,9 @@ export type Props = {
2830
}
2931

3032
export default new OAuthProvider({
31-
apiRoute: '/workers/sandbox/sse',
33+
apiRoute: '/sse',
3234
// @ts-ignore
33-
apiHandler: ContainerMcpAgent.mount('/workers/sandbox/sse', { binding: 'CONTAINER_MCP_AGENT' }),
35+
apiHandler: ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }),
3436
// @ts-ignore
3537
defaultHandler: CloudflareAuthHandler,
3638
authorizeEndpoint: '/oauth/authorize',
Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,5 @@
11
{
2-
"compilerOptions": {
3-
"target": "ESNext",
4-
"lib": ["ESNext", "DOM"],
5-
"jsx": "react-jsx",
6-
"module": "ESNext",
7-
"moduleResolution": "bundler",
8-
"types": ["./worker-configuration.d.ts", "@cloudflare/workers-types/2023-07-01"],
9-
"noEmit": true,
10-
"esModuleInterop": true,
11-
"forceConsistentCasingInFileNames": true,
12-
"strict": true,
13-
"skipLibCheck": true
14-
},
15-
"include": ["server/**.ts", "shared/**.ts"]
2+
"extends": "@repo/typescript-config/workers.json",
3+
"include": ["*/**.ts", "./vitest.config.evals.ts"],
4+
"exclude": ["container/**.ts"]
165
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import { defineWorkersConfig } from '@cloudflare/vitest-pool-workers/config'
2+
3+
// Options passed under `poolOptions.workers`: isolated storage is enabled and
// the wrangler config is read from ./wrangler.jsonc.
const workersPoolOptions = {
	isolatedStorage: true,
	wrangler: { configPath: './wrangler.jsonc' },
}

// Vitest config for the eval suite: only files matching the `*.eval.*`
// pattern below are included.
export default defineWorkersConfig({
	test: {
		include: ['**/*.eval.?(c|m)[jt]s?(x)'],
		poolOptions: { workers: workersPoolOptions },
	},
})

0 commit comments

Comments
 (0)