Skip to content

Commit 69c8735

Browse files
committed
Add file_write eval and refine evals to include the correct tool calling data
1 parent: 48a91be · commit: 69c8735

File tree

11 files changed: +142 −49 lines changed

.github/workflows/evals.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,4 +25,4 @@ jobs:
2525
- name: Install dependencies
2626
run: pnpm install
2727
- name: Run evals
28-
run: pnpm eval
28+
run: pnpm eval:ci
Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,43 @@
1+
import { describeEval } from 'vitest-evals'
2+
import { assert } from "vitest"
3+
import { z } from 'zod'
4+
5+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
6+
import { eachModel } from '@repo/eval-tools/src/test-models'
7+
8+
import { initializeClient, runTask } from './utils'
9+
10+
eachModel('$modelName', ({ model }) => {
11+
describeEval('Runs container initialize', {
12+
data: async () => [
13+
{
14+
input: 'write a file named test.txt containing the text "asdf"',
15+
expected: 'The container_file_write tool was called and the file\'s content is "asdf"',
16+
},
17+
],
18+
task: async (input) => {
19+
const client = await initializeClient()
20+
const promptOutput = await runTask(client, model, input)
21+
const fileRead = client.listTools().find((tool) => {
22+
if (tool.name === 'container_file_read') {
23+
return tool
24+
}
25+
})
26+
27+
assert(fileRead !== undefined)
28+
await client.callTool(
29+
{
30+
...fileRead,
31+
arguments: {
32+
args: { path: 'file://test.txt' },
33+
},
34+
},
35+
z.any() as any,
36+
{}
37+
)
38+
return promptOutput
39+
},
40+
scorers: [checkFactuality],
41+
threshold: 1,
42+
})
43+
})

apps/sandbox-container/evals/initialize.eval.ts

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,9 @@
1-
import { MCPClientManager } from 'agents/mcp/client'
2-
import { generateText, tool, ToolExecutionOptions, ToolSet } from 'ai'
31
import { describeEval } from 'vitest-evals'
42

53
import { checkFactuality } from '@repo/eval-tools/src/scorers'
64
import { eachModel } from '@repo/eval-tools/src/test-models'
75

8-
import { runTask } from './utils'
6+
import { initializeClient, runTask } from './utils'
97

108
eachModel('$modelName', ({ model }) => {
119
describeEval('Runs container initialize', {
@@ -17,7 +15,8 @@ eachModel('$modelName', ({ model }) => {
1715
},
1816
],
1917
task: async (input) => {
20-
return await runTask(model, input)
18+
const client = await initializeClient()
19+
return await runTask(client, model, input)
2120
},
2221
scorers: [checkFactuality],
2322
threshold: 1,

apps/sandbox-container/evals/utils.ts

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,20 +3,32 @@ import { MCPClientManager } from 'agents/mcp/client'
33
import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
44

55
import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
6+
import { z } from 'zod'
67

7-
export async function runTask(model: LanguageModelV1, input: string) {
8+
export async function initializeClient(): Promise<MCPClientManager> {
89
const clientManager = new MCPClientManager('test-client', '0.0.0')
9-
await clientManager.connect('http://localhost:8787/sse')
10+
await clientManager.connect('http://localhost:8976/sse')
11+
return clientManager
12+
}
1013

14+
export async function runTask(clientManager: MCPClientManager, model: LanguageModelV1, input: string) {
1115
const tools = clientManager.listTools()
1216
const toolSet: ToolSet = tools.reduce((acc, v) => {
1317
acc[v.name] = tool({
1418
parameters: jsonSchemaToZod(v.inputSchema as JsonSchemaObject),
1519
description: v.description,
1620
execute: async (args, opts) => {
17-
const res = await clientManager.callTool(v, args, { signal: opts.abortSignal })
18-
console.log(res.toolResult)
19-
return res.content
21+
try {
22+
const res = await clientManager.callTool({
23+
...v,
24+
arguments: { ...args },
25+
}, z.any() as any, { signal: opts.abortSignal })
26+
return res.content
27+
} catch (e) {
28+
console.log("Error calling tool")
29+
console.log(e)
30+
return e
31+
}
2032
},
2133
})
2234
return acc

apps/sandbox-container/package.json

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,9 @@
1313
"postinstall": "mkdir -p workdir",
1414
"test": "vitest",
1515
"types": "wrangler types",
16-
"eval:dev": "concurrently \"npm run dev\" \"vitest --config vitest.config.evals.ts\"",
17-
"eval": "concurrently \"npm run dev\" \"vitest run --config vitest.config.evals.ts\""
16+
"eval:dev": "concurrently \"npm run eval:server\" \"vitest --config vitest.config.evals.ts\"",
17+
"eval:server": "concurrently \"tsx container/index.ts\" \"wrangler dev --var \"ENVIRONMENT:test\"\"",
18+
"eval:ci": "npm run eval:server & wait-port 8976 && vitest run --config vitest.config.evals.ts"
1819
},
1920
"dependencies": {
2021
"@cloudflare/workers-oauth-provider": "0.0.2",

apps/sandbox-container/server/containerHelpers.ts

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,11 @@
11
export const MAX_CONTAINERS = 8
22
export async function startAndWaitForPort(
3-
environment: 'dev' | 'prod',
3+
environment: 'dev' | 'prod' | 'test',
44
container: Container | undefined,
55
portToAwait: number,
66
maxTries = 10
77
): Promise<boolean> {
8-
if (environment === 'dev') {
8+
if (environment === 'dev' || environment === "test") {
99
console.log('Running in dev, assuming locally running container')
1010
return true
1111
}
@@ -62,12 +62,12 @@ export async function startAndWaitForPort(
6262
}
6363

6464
export async function proxyFetch(
65-
environment: 'dev' | 'prod',
65+
environment: 'dev' | 'prod' | 'test',
6666
container: Container | undefined,
6767
request: Request,
6868
portNumber: number
6969
): Promise<Response> {
70-
if (environment === 'dev') {
70+
if (environment === 'dev' || environment === "test") {
7171
const url = request.url
7272
.replace('https://', 'http://')
7373
.replace('http://host', 'http://localhost')

apps/sandbox-container/server/index.ts

Lines changed: 27 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -10,13 +10,14 @@ import { ContainerManager } from './containerManager'
1010
import { ContainerMcpAgent } from './containerMcp'
1111

1212
import type { AccountSchema, UserSchema } from '@repo/mcp-common/src/cloudflare-oauth-handler'
13+
import { McpAgent } from 'agents/mcp'
1314

1415
export { ContainerManager, ContainerMcpAgent }
1516

1617
export type Env = {
1718
CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
1819
CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
19-
ENVIRONMENT: 'dev' | 'prod'
20+
ENVIRONMENT: 'dev' | 'prod' | "test"
2021
CLOUDFLARE_CLIENT_ID: string
2122
CLOUDFLARE_CLIENT_SECRET: string
2223
}
@@ -38,17 +39,28 @@ const ContainerScopes = {
3839
offline_access: 'Grants refresh tokens for long-lived access.',
3940
} as const
4041

41-
export default new OAuthProvider({
42-
apiRoute: '/sse',
43-
// @ts-ignore
44-
apiHandler: ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }),
45-
// @ts-ignore
46-
defaultHandler: createAuthHandlers({ scopes: ContainerScopes }),
47-
authorizeEndpoint: '/oauth/authorize',
48-
tokenEndpoint: '/token',
49-
tokenExchangeCallback: (options) =>
50-
handleTokenExchangeCallback(options, env.CLOUDFLARE_CLIENT_ID, env.CLOUDFLARE_CLIENT_SECRET),
51-
// Cloudflare access token TTL
52-
accessTokenTTL: 3600,
53-
clientRegistrationEndpoint: '/register',
54-
})
42+
export default {
43+
fetch: (req: Request, env: Env, ctx: ExecutionContext) => {
44+
if (env.ENVIRONMENT === "test") {
45+
ctx.props = {}
46+
return ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }).fetch(req, env as Record<string, DurableObjectNamespace<McpAgent> | any>, ctx)
47+
}
48+
49+
return new OAuthProvider({
50+
apiRoute: '/sse',
51+
// @ts-ignore
52+
apiHandler: ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }),
53+
// @ts-ignore
54+
defaultHandler: createAuthHandlers({ scopes: ContainerScopes }),
55+
authorizeEndpoint: '/oauth/authorize',
56+
tokenEndpoint: '/token',
57+
tokenExchangeCallback: (options) =>
58+
handleTokenExchangeCallback(options, env.CLOUDFLARE_CLIENT_ID, env.CLOUDFLARE_CLIENT_SECRET),
59+
// Cloudflare access token TTL
60+
accessTokenTTL: 3600,
61+
clientRegistrationEndpoint: '/register',
62+
}).fetch(req, env, ctx)
63+
}
64+
}/*
65+
66+
*/

apps/sandbox-container/vitest.config.evals.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,11 @@ export default defineWorkersConfig({
77
workers: {
88
isolatedStorage: true,
99
wrangler: { configPath: './wrangler.jsonc' },
10+
miniflare: {
11+
bindings: {
12+
ENVIRONMENT: "test"
13+
}
14+
}
1015
},
1116
},
1217
},

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
"test": "vitest run",
2626
"fix:format": "prettier . --write",
2727
"test:watch": "vitest",
28-
"eval": "run-turbo eval"
28+
"eval:ci": "run-turbo eval:ci"
2929
},
3030
"devDependencies": {
3131
"@changesets/cli": "2.28.1",

packages/eval-tools/src/scorers.ts

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -19,23 +19,27 @@ export const checkFactuality: ScoreFn = async ({ input, expected, output }) => {
1919
* {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
2020
*/
2121
prompt: `
22-
You are comparing a submitted answer to an expert answer on a given question. Here is the data:
22+
You are comparing a submitted answer to an expert's rubric on a given question. Here is the data:
2323
[BEGIN DATA]
2424
************
2525
[Question]: ${input}
2626
************
27-
[Expert]: ${expected}
27+
[Expert Rubric]: ${expected}
2828
************
2929
[Submission]: ${output}
3030
************
3131
[END DATA]
32+
33+
Submissions contain message metadata inside of the <message_content> XML tags.
34+
The attribute \`type=text\` indicates text content. The attribute \`type=tool-call\` indicates a tool call.
35+
Use this metadata to determine the accuracy of the response.
3236
33-
Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
34-
The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
35-
(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
36-
(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
37-
(C) The submitted answer contains all the same details as the expert answer.
38-
(D) There is a disagreement between the submitted answer and the expert answer.
37+
Compare the factual content of the submitted answer with the expert's answer rubric. Ignore any differences in style, grammar, or punctuation.
38+
The submitted answer may either be a subset or superset of the expert's expected answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
39+
(A) The submitted answer is a subset of the answer the expert's rubric describes and is fully consistent with it.
40+
(B) The submitted answer is a superset of the answer the expert's rubric describes and is fully consistent with it.
41+
(C) The submitted answer contains all the same details of the answer the expert's rubric describes.
42+
(D) There is a disagreement between the submitted answer and the expert's rubric.
3943
(E) The answers differ, but these differences don't matter from the perspective of factuality.
4044
`,
4145
schema: z.object({
@@ -49,7 +53,7 @@ export const checkFactuality: ScoreFn = async ({ input, expected, output }) => {
4953
*/
5054
const scores = {
5155
A: 0.4,
52-
B: 0.6,
56+
B: 1,
5357
C: 1,
5458
D: 0,
5559
E: 1,

0 commit comments

Comments
 (0)