Skip to content

Commit af7f16b

Browse files
committed
Add basic evals setup
1 parent 3e1f73e commit af7f16b

File tree

16 files changed

+12090
-38
lines changed

16 files changed

+12090
-38
lines changed
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import { describeEval } from "vitest-evals"
2+
import { eachModel } from "@repo/eval-tools/src/test-models"
3+
import { checkFactuality } from "@repo/eval-tools/src/scorers"
4+
import { ToolExecutionOptions, ToolSet, generateText, tool } from "ai"
5+
import { MCPClientManager } from "agents/mcp/client"
6+
import { runTask } from "./utils"
7+
8+
// Registers the "Runs container initialize" eval inside the eachModel callback.
// NOTE(review): presumably eachModel invokes the callback once per configured
// model — confirm in @repo/eval-tools.
eachModel("$modelName", ({ model }) => {
  // Builds a fresh case list on every call, one prompt with the behaviour the
  // scorer should find in the transcript.
  const loadCases = async () => [
    {
      input: "create and ping a container",
      expected: "The container_initialize tool was called and then the container_ping tool was called",
    },
  ]

  // Sends the prompt to the model under test and resolves to the transcript
  // produced by runTask.
  const executeCase = async (input: string) => runTask(model, input)

  describeEval("Runs container initialize", {
    data: loadCases,
    task: executeCase,
    scorers: [checkFactuality],
    threshold: 1,
  })
})
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import { MCPClientManager } from "agents/mcp/client"
2+
import { LanguageModelV1, StepResult, ToolSet, generateText, streamText, tool } from "ai"
3+
import { jsonSchemaToZod, type JsonSchemaObject } from "@n8n/json-schema-to-zod";
4+
5+
/** Messages carried by the response of a `streamText` call. */
type ResponseMessages = Awaited<ReturnType<typeof streamText>["response"]>["messages"]

/** Wraps a piece of text in a `<message_content type="text">` block. */
function formatTextContent(text: string): string {
  return `<message_content type="text">
${text}
</message_content>`
}

/**
 * Serializes response messages into one tagged transcript string.
 *
 * Text becomes `<message_content type="text">` blocks and tool calls become
 * `<message_content type="tool-call">` blocks holding the tool name and its
 * JSON-encoded arguments. Parts of any other type are skipped. Blocks are
 * concatenated without a separator.
 *
 * @param messages - Messages taken from the `streamText` response.
 * @returns The concatenated transcript; empty when nothing was rendered.
 */
function formatMessagesWithTools(messages: ResponseMessages): string {
  let transcript = ""
  for (const message of messages) {
    // Plain-string content is emitted as one block. Iterating it with
    // `for..of` would yield one block per character.
    if (typeof message.content === "string") {
      transcript += formatTextContent(message.content)
      continue
    }
    for (const part of message.content) {
      if (part.type === "text") {
        transcript += formatTextContent(part.text)
      } else if (part.type === "tool-call") {
        // The type attribute is quoted so it matches the text blocks.
        transcript += `<message_content type="${part.type}">
<tool_name>${part.toolName}</tool_name>
<tool_arguments>${JSON.stringify(part.args)}</tool_arguments>
</message_content>`
      }
    }
  }
  return transcript
}

/**
 * Runs a prompt against a model with the tools listed by the MCP server at
 * `http://localhost:8787/sse`, and returns a tagged transcript of the text and
 * tool calls in the response messages.
 *
 * NOTE(review): the MCP connection opened here is never closed in this
 * function — confirm whether the client manager needs explicit cleanup.
 *
 * @param model - Language model used for the run.
 * @param input - User prompt sent to the model.
 * @returns The transcript built by `formatMessagesWithTools`.
 */
export async function runTask(model: LanguageModelV1, input: string): Promise<string> {
  const clientManager = new MCPClientManager("test-client", "0.0.0")
  await clientManager.connect("http://localhost:8787/sse")

  // Expose every tool listed by the client manager under its own name.
  const toolSet: ToolSet = {}
  for (const mcpTool of clientManager.listTools()) {
    toolSet[mcpTool.name] = tool({
      parameters: jsonSchemaToZod(mcpTool.inputSchema as JsonSchemaObject),
      description: mcpTool.description,
      execute: async (args, opts) => {
        console.log(`executing tool ${mcpTool.name}`)
        // The model-supplied arguments travel inside the request params.
        // NOTE(review): relies on the `agents` signature
        // callTool(params, resultSchema?, options?) — confirm against the
        // installed version. Passing `undefined` leaves the schema choice to
        // the callee.
        const res = await clientManager.callTool(
          { ...mcpTool, arguments: args },
          undefined,
          { signal: opts.abortSignal }
        )
        console.log(res.toolResult)
        return res.content
      },
    })
  }

  const res = streamText({
    model,
    system: "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
    tools: toolSet,
    prompt: input,
    maxRetries: 1,
    maxSteps: 10,
  })

  // Drain the stream without using its parts; only the final response
  // messages are read below.
  for await (const _part of res.fullStream) {
    // intentionally empty
  }

  const messages = (await res.response).messages
  for (const message of messages) {
    console.log(message.content)
  }

  const messagesWithTools = formatMessagesWithTools(messages)
  console.log(messagesWithTools)

  return messagesWithTools
}

apps/sandbox-container/package.json

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,31 @@
88
"dev": "concurrently \"tsx container/index.ts\" \"wrangler dev --var \"ENVIRONMENT:dev\"\"",
99
"build": "docker build .",
1010
"start": "wrangler dev",
11-
"start:container": "tsx container/index.ts"
11+
"start:container": "tsx container/index.ts",
12+
"eval:dev": "concurrently \"npm run dev\" \"vitest --config vitest.config.evals.ts\""
1213
},
1314
"dependencies": {
1415
"@cloudflare/workers-types": "^4.20250320.0",
1516
"@hono/node-server": "^1.13.8",
1617
"@hono/zod-validator": "^0.4.3",
1718
"@modelcontextprotocol/sdk": "^1.7.0",
19+
"@n8n/json-schema-to-zod": "^1.1.0",
20+
"@repo/eval-tools": "workspace:*",
1821
"@types/node": "^22.13.10",
19-
"agents": "^0.0.42",
22+
"agents": "^0.0.60",
2023
"cron-schedule": "^5.0.4",
2124
"esbuild": "^0.25.1",
2225
"hono": "^4.7.5",
2326
"mime": "^4.0.6",
2427
"octokit": "^4.1.2",
2528
"partyserver": "^0.0.65",
2629
"tsx": "^4.19.3",
30+
"vitest-evals": "^0.1.4",
2731
"workers-mcp": "0.1.0-3",
2832
"zod": "^3.24.2"
2933
},
3034
"devDependencies": {
35+
"ai": "^4.3.6",
3136
"concurrently": "^9.1.2",
3237
"wrangler": "^4.9.1"
3338
}
Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,5 @@
11
{
2-
"compilerOptions": {
3-
"target": "ESNext",
4-
"lib": ["ESNext", "DOM"],
5-
"jsx": "react-jsx",
6-
"module": "ESNext",
7-
"moduleResolution": "bundler",
8-
"types": ["@cloudflare/workers-types/2023-07-01"],
9-
"noEmit": true,
10-
"esModuleInterop": true,
11-
"forceConsistentCasingInFileNames": true,
12-
"strict": true,
13-
"skipLibCheck": true
14-
},
15-
"include": ["server/**.ts", "shared/**.ts"]
2+
"extends": "@repo/typescript-config/workers.json",
3+
"include": ["*/**.ts", "./vitest.config.evals.ts"],
4+
"exclude": ["container/**.ts"]
165
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import { defineWorkersConfig } from "@cloudflare/vitest-pool-workers/config";
2+
3+
// Vitest configuration for the eval suite, built with defineWorkersConfig.

// Glob selecting the eval files (`*.eval.` followed by a js/ts-family extension).
const evalFilePatterns = ["**/*.eval.?(c|m)[jt]s?(x)"];

// Options handed to the `workers` pool: isolated storage, plus the wrangler
// config file to load.
const workersPoolOptions = {
  isolatedStorage: true,
  wrangler: { configPath: "./wrangler.jsonc" },
};

export default defineWorkersConfig({
  test: {
    include: evalFilePatterns,
    poolOptions: { workers: workersPoolOptions },
  },
});

0 commit comments

Comments
 (0)