Skip to content

Commit ca36970

Browse files
committed
Add basic evals setup which runs locally
1 parent 7b2f086 commit ca36970

File tree

18 files changed

+11000
-5531
lines changed

18 files changed

+11000
-5531
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Ambient augmentation of the 'cloudflare:test' module: declares ProvidedEnv as an
// extension of Env with no extra members.
// NOTE(review): Env is not declared in this file — presumably the generated worker
// bindings type; confirm where it comes from.
declare module 'cloudflare:test' {
2+
interface ProvidedEnv extends Env {}
3+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import { describeEval } from "vitest-evals"
2+
import { eachModel } from "@repo/eval-tools/src/test-models"
3+
import { checkFactuality } from "@repo/eval-tools/src/scorers"
4+
import { ToolExecutionOptions, ToolSet, generateText, tool } from "ai"
5+
import { MCPClientManager } from "agents/mcp/client"
6+
import { runTask } from "./utils"
7+
8+
// Registers one eval suite per model supplied by eachModel. The single case asks the
// model to create and ping a container; checkFactuality scores the transcript returned
// by runTask against the expected tool-call sequence, with a threshold of 1.
eachModel("$modelName", ({ model }) => {
  // Builds a fresh case list on every call, exactly as an inline loader would.
  const loadCases = async () => [
    {
      input: "create and ping a container",
      expected: "The container_initialize tool was called and then the container_ping tool was called",
    },
  ]

  describeEval("Runs container initialize", {
    data: loadCases,
    // runTask already returns a promise, so it is handed back directly.
    task: (input) => runTask(model, input),
    scorers: [checkFactuality],
    threshold: 1,
  })
})
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import { MCPClientManager } from "agents/mcp/client"
2+
import { LanguageModelV1, ToolSet, streamText, tool } from "ai"
3+
import { jsonSchemaToZod, type JsonSchemaObject } from "@n8n/json-schema-to-zod";
4+
5+
/**
 * Runs one eval prompt against `model` with the MCP server's tools attached and
 * returns a transcript of the response messages for a scorer to inspect.
 *
 * An MCP client is connected to the locally running server at
 * http://localhost:8787/sse, every tool it lists is wrapped as an AI SDK tool, and
 * the model is allowed up to 10 steps. Text and tool-call parts of the response
 * messages are then serialised as `<message_content>` elements; other part types
 * (for example tool results) are left out of the transcript.
 *
 * @param model - Language model under evaluation.
 * @param input - User prompt handed to the model.
 * @returns The concatenated `<message_content>` elements, or an empty string when
 *   the response contains no text or tool-call parts.
 */
export async function runTask(model: LanguageModelV1, input: string): Promise<string> {
  const clientManager = new MCPClientManager("test-client", "0.0.0")
  await clientManager.connect("http://localhost:8787/sse")
  // NOTE(review): the connection is never closed here — confirm whether the client
  // manager needs explicit cleanup between eval runs.

  // Expose every MCP tool to the model under its own name.
  const toolSet: ToolSet = {}
  for (const mcpTool of clientManager.listTools()) {
    toolSet[mcpTool.name] = tool({
      parameters: jsonSchemaToZod(mcpTool.inputSchema as JsonSchemaObject),
      description: mcpTool.description,
      execute: async (args, opts) => {
        // NOTE(review): the tool descriptor and the arguments are passed as separate
        // positional parameters — confirm this matches MCPClientManager.callTool,
        // which may expect `{ ...tool, arguments }` followed by a result schema.
        const callResult = await clientManager.callTool(mcpTool, args, { signal: opts.abortSignal })
        return callResult.content
      },
    })
  }

  const res = streamText({
    model,
    system: "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
    tools: toolSet,
    prompt: input,
    maxRetries: 1,
    maxSteps: 10,
  })

  for await (const _part of res.fullStream) {
    // Drain the stream: the individual parts are not needed, only the final
    // response messages read below (presumably available once the stream ends).
  }

  // convert into an LLM readable result so our factuality checker can validate tool calls
  let messagesWithTools = ""
  const { messages } = await res.response
  for (const message of messages) {
    // Plain-string content is emitted as one text element; iterating it would
    // otherwise yield one element per character.
    if (typeof message.content === "string") {
      messagesWithTools += `<message_content type="text">${message.content}</message_content>`
      continue
    }

    for (const messagePart of message.content) {
      if (messagePart.type === "tool-call") {
        messagesWithTools += [
          `<message_content type="${messagePart.type}">`,
          `<tool_name>${messagePart.toolName}</tool_name>`,
          `<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>`,
          `</message_content>`,
        ].join("\n")
      } else if (messagePart.type === "text") {
        messagesWithTools += `<message_content type="${messagePart.type}">${messagePart.text}</message_content>`
      }
    }
  }

  return messagesWithTools
}

apps/sandbox-container/package.json

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,28 +12,34 @@
1212
"start:container": "tsx container/index.ts",
1313
"postinstall": "mkdir -p workdir",
1414
"test": "vitest",
15-
"types": "wrangler types"
15+
"types": "wrangler types",
16+
"eval:dev": "concurrently \"npm run dev\" \"vitest --config vitest.config.evals.ts\""
1617
},
1718
"dependencies": {
1819
"@cloudflare/workers-oauth-provider": "0.0.2",
1920
"@cloudflare/workers-types": "^4.20250320.0",
2021
"@hono/node-server": "^1.13.8",
2122
"@hono/zod-validator": "^0.4.3",
22-
"@modelcontextprotocol/sdk": "^1.7.0",
23+
"@modelcontextprotocol/sdk": "^1.9.0",
24+
"@n8n/json-schema-to-zod": "^1.1.0",
25+
"@repo/eval-tools": "workspace:*",
26+
"@repo/mcp-common": "workspace:*",
2327
"@types/node": "^22.13.10",
24-
"agents": "^0.0.42",
28+
"agents": "^0.0.60",
2529
"cron-schedule": "^5.0.4",
2630
"esbuild": "^0.25.1",
2731
"hono": "^4.7.5",
2832
"mime": "^4.0.6",
2933
"octokit": "^4.1.2",
3034
"partyserver": "^0.0.65",
35+
"simple-git-hooks": "^2.12.1",
3136
"tsx": "^4.19.3",
37+
"vitest-evals": "^0.1.4",
3238
"workers-mcp": "0.1.0-3",
33-
"zod": "^3.24.2",
34-
"@repo/mcp-common": "workspace:*"
39+
"zod": "^3.24.2"
3540
},
3641
"devDependencies": {
42+
"ai": "^4.3.6",
3743
"concurrently": "^9.1.2",
3844
"wrangler": "^4.9.1"
3945
}

apps/sandbox-container/server/index.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ export { ContainerManager, ContainerMcpAgent }
1616
export type Env = {
1717
CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
1818
CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
19-
ENVIRONMENT: 'dev' | 'prod'
19+
ENVIRONMENT: 'dev' | 'prod',
20+
CLOUDFLARE_CLIENT_ID: string,
21+
CLOUDFLARE_CLIENT_SECRET: string,
2022
}
2123

2224
// Context from the auth process, encrypted & stored in the auth token
Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,5 @@
11
{
2-
"compilerOptions": {
3-
"target": "ESNext",
4-
"lib": ["ESNext", "DOM"],
5-
"jsx": "react-jsx",
6-
"module": "ESNext",
7-
"moduleResolution": "bundler",
8-
"types": ["./worker-configuration.d.ts", "@cloudflare/workers-types/2023-07-01"],
9-
"noEmit": true,
10-
"esModuleInterop": true,
11-
"forceConsistentCasingInFileNames": true,
12-
"strict": true,
13-
"skipLibCheck": true
14-
},
15-
"include": ["server/**.ts", "shared/**.ts"]
2+
"extends": "@repo/typescript-config/workers.json",
3+
"include": ["*/**.ts", "./vitest.config.evals.ts"],
4+
"exclude": ["container/**.ts"]
165
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import { defineWorkersConfig } from "@cloudflare/vitest-pool-workers/config";
2+
3+
// Settings for the Workers test pool: isolated storage is enabled and the Worker is
// configured from ./wrangler.jsonc.
const workersPoolOptions = {
  isolatedStorage: true,
  wrangler: { configPath: "./wrangler.jsonc" },
}

// Vitest configuration for the eval suite: only files matching the *.eval.* glob are
// collected, and they run with the Workers pool options above.
export default defineWorkersConfig({
  test: {
    include: ["**/*.eval.?(c|m)[jt]s?(x)"],
    poolOptions: { workers: workersPoolOptions },
  },
});

0 commit comments

Comments
 (0)