Add file_write eval and refine evals to include the correct tool calling data

cmsparks · cmsparks · commit 69c873571a3f · 2025-04-22T10:18:09.000-05:00
diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
@@ -25,4 +25,4 @@ jobs:
       - name: Install dependencies
         run: pnpm install
       - name: Run evals
-        run: pnpm eval
+        run: pnpm eval:ci
diff --git a/apps/sandbox-container/evals/file_write.eval.ts b/apps/sandbox-container/evals/file_write.eval.ts
@@ -0,0 +1,43 @@
+import { describeEval } from 'vitest-evals'
+import { assert } from "vitest"
+import { z } from 'zod'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { initializeClient, runTask } from './utils'
+
+eachModel('$modelName', ({ model }) => {
+	describeEval('Runs container file write', {
+		data: async () => [
+			{
+				input: 'write a file named test.txt containing the text "asdf"',
+				expected: 'The container_file_write tool was called and the file\'s content is "asdf"',
+			},
+		],
+		task: async (input) => {
+			const client = await initializeClient()
+			// Run the prompt first so the model has the chance to call container_file_write.
+			const promptOutput = await runTask(client, model, input)
+			// find() takes a boolean predicate, so compare the tool name directly.
+			const fileRead = client.listTools().find((tool) => tool.name === 'container_file_read')
+
+			assert(fileRead !== undefined, 'container_file_read tool is not registered')
+			// Read test.txt back through the container's own read tool.
+			// NOTE(review): the read-back result is discarded; only promptOutput is scored.
+			await client.callTool(
+				{
+					...fileRead,
+					arguments: {
+						args: { path: 'file://test.txt' },
+					},
+				},
+				z.any() as any,
+				{}
+			)
+			return promptOutput
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+	})
+})
diff --git a/apps/sandbox-container/evals/initialize.eval.ts b/apps/sandbox-container/evals/initialize.eval.ts
@@ -1,11 +1,9 @@
-import { MCPClientManager } from 'agents/mcp/client'
-import { generateText, tool, ToolExecutionOptions, ToolSet } from 'ai'
 import { describeEval } from 'vitest-evals'
 
 import { checkFactuality } from '@repo/eval-tools/src/scorers'
 import { eachModel } from '@repo/eval-tools/src/test-models'
 
-import { runTask } from './utils'
+import { initializeClient, runTask } from './utils'
 
 eachModel('$modelName', ({ model }) => {
 	describeEval('Runs container initialize', {
@@ -17,7 +15,8 @@ eachModel('$modelName', ({ model }) => {
 			},
 		],
 		task: async (input) => {
-			return await runTask(model, input)
+			const client = await initializeClient()
+			return await runTask(client, model, input)
 		},
 		scorers: [checkFactuality],
 		threshold: 1,
diff --git a/apps/sandbox-container/evals/utils.ts b/apps/sandbox-container/evals/utils.ts
@@ -3,20 +3,32 @@ import { MCPClientManager } from 'agents/mcp/client'
 import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
 
 import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
+import { z } from 'zod'
 
-export async function runTask(model: LanguageModelV1, input: string) {
+export async function initializeClient(): Promise<MCPClientManager> {
 	const clientManager = new MCPClientManager('test-client', '0.0.0')
-	await clientManager.connect('http://localhost:8787/sse')
+	await clientManager.connect('http://localhost:8976/sse')
+	return clientManager
+}
 
+export async function runTask(clientManager: MCPClientManager, model: LanguageModelV1, input: string) {
 	const tools = clientManager.listTools()
 	const toolSet: ToolSet = tools.reduce((acc, v) => {
 		acc[v.name] = tool({
 			parameters: jsonSchemaToZod(v.inputSchema as JsonSchemaObject),
 			description: v.description,
 			execute: async (args, opts) => {
-				const res = await clientManager.callTool(v, args, { signal: opts.abortSignal })
-				console.log(res.toolResult)
-				return res.content
+				try {
+					const res = await clientManager.callTool({
+						...v,
+						arguments: { ...args },
+					}, z.any() as any, { signal: opts.abortSignal })
+					return res.content
+				} catch (e) {
+					console.log("Error calling tool")
+					console.log(e)
+					return e
+				}
 			},
 		})
 		return acc
diff --git a/apps/sandbox-container/package.json b/apps/sandbox-container/package.json
@@ -13,8 +13,9 @@
 		"postinstall": "mkdir -p workdir",
 		"test": "vitest",
 		"types": "wrangler types",
-		"eval:dev": "concurrently \"npm run dev\" \"vitest --config vitest.config.evals.ts\"",
-		"eval": "concurrently \"npm run dev\" \"vitest run --config vitest.config.evals.ts\""
+		"eval:dev": "concurrently \"npm run eval:server\" \"vitest --config vitest.config.evals.ts\"",
+		"eval:server": "concurrently \"tsx container/index.ts\" \"wrangler dev --var \"ENVIRONMENT:test\"\"",
+		"eval:ci": "npm run eval:server & wait-port 8976 && vitest run --config vitest.config.evals.ts"
 	},
 	"dependencies": {
 		"@cloudflare/workers-oauth-provider": "0.0.2",
diff --git a/apps/sandbox-container/server/containerHelpers.ts b/apps/sandbox-container/server/containerHelpers.ts
@@ -1,11 +1,11 @@
 export const MAX_CONTAINERS = 8
 export async function startAndWaitForPort(
-	environment: 'dev' | 'prod',
+	environment: 'dev' | 'prod' | 'test',
 	container: Container | undefined,
 	portToAwait: number,
 	maxTries = 10
 ): Promise<boolean> {
-	if (environment === 'dev') {
+	if (environment === 'dev' || environment === "test") {
 		console.log('Running in dev, assuming locally running container')
 		return true
 	}
@@ -62,12 +62,12 @@ export async function startAndWaitForPort(
 }
 
 export async function proxyFetch(
-	environment: 'dev' | 'prod',
+	environment: 'dev' | 'prod' | 'test',
 	container: Container | undefined,
 	request: Request,
 	portNumber: number
 ): Promise<Response> {
-	if (environment === 'dev') {
+	if (environment === 'dev' || environment === "test") {
 		const url = request.url
 			.replace('https://', 'http://')
 			.replace('http://host', 'http://localhost')
diff --git a/apps/sandbox-container/server/index.ts b/apps/sandbox-container/server/index.ts
@@ -10,13 +10,14 @@ import { ContainerManager } from './containerManager'
 import { ContainerMcpAgent } from './containerMcp'
 
 import type { AccountSchema, UserSchema } from '@repo/mcp-common/src/cloudflare-oauth-handler'
+import { McpAgent } from 'agents/mcp'
 
 export { ContainerManager, ContainerMcpAgent }
 
 export type Env = {
 	CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
 	CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
-	ENVIRONMENT: 'dev' | 'prod'
+	ENVIRONMENT: 'dev' | 'prod' | "test"
 	CLOUDFLARE_CLIENT_ID: string
 	CLOUDFLARE_CLIENT_SECRET: string
 }
@@ -38,17 +39,28 @@ const ContainerScopes = {
 	offline_access: 'Grants refresh tokens for long-lived access.',
 } as const
 
-export default new OAuthProvider({
-	apiRoute: '/sse',
-	// @ts-ignore
-	apiHandler: ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }),
-	// @ts-ignore
-	defaultHandler: createAuthHandlers({ scopes: ContainerScopes }),
-	authorizeEndpoint: '/oauth/authorize',
-	tokenEndpoint: '/token',
-	tokenExchangeCallback: (options) =>
-		handleTokenExchangeCallback(options, env.CLOUDFLARE_CLIENT_ID, env.CLOUDFLARE_CLIENT_SECRET),
-	// Cloudflare access token TTL
-	accessTokenTTL: 3600,
-	clientRegistrationEndpoint: '/register',
-})
+// Worker entry point: serves the MCP agent directly in test, behind OAuthProvider otherwise.
+export default {
+	fetch: (req: Request, env: Env, ctx: ExecutionContext) => {
+		if (env.ENVIRONMENT === "test") {
+			// No OAuth layer on this path, so props (presumably set by OAuthProvider otherwise) start empty.
+			ctx.props = {}
+			return ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }).fetch(req, env as Record<string, DurableObjectNamespace<McpAgent> | any>, ctx)
+		}
+
+		return new OAuthProvider({
+			apiRoute: '/sse',
+			// @ts-ignore
+			apiHandler: ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }),
+			// @ts-ignore
+			defaultHandler: createAuthHandlers({ scopes: ContainerScopes }),
+			authorizeEndpoint: '/oauth/authorize',
+			tokenEndpoint: '/token',
+			tokenExchangeCallback: (options) =>
+				handleTokenExchangeCallback(options, env.CLOUDFLARE_CLIENT_ID, env.CLOUDFLARE_CLIENT_SECRET),
+			// Cloudflare access token TTL
+			accessTokenTTL: 3600,
+			clientRegistrationEndpoint: '/register',
+		}).fetch(req, env, ctx)
+	}
+}
diff --git a/apps/sandbox-container/vitest.config.evals.ts b/apps/sandbox-container/vitest.config.evals.ts
@@ -7,6 +7,11 @@ export default defineWorkersConfig({
 			workers: {
 				isolatedStorage: true,
 				wrangler: { configPath: './wrangler.jsonc' },
+				miniflare: {
+					bindings: {
+						ENVIRONMENT: "test"
+					}
+				}
 			},
 		},
 	},
diff --git a/package.json b/package.json
@@ -25,7 +25,7 @@
 		"test": "vitest run",
 		"fix:format": "prettier . --write",
 		"test:watch": "vitest",
-		"eval": "run-turbo eval"
+		"eval:ci": "run-turbo eval:ci"
 	},
 	"devDependencies": {
 		"@changesets/cli": "2.28.1",
diff --git a/packages/eval-tools/src/scorers.ts b/packages/eval-tools/src/scorers.ts
@@ -19,23 +19,27 @@ export const checkFactuality: ScoreFn = async ({ input, expected, output }) => {
 		 * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
 		 */
 		prompt: `
-        You are comparing a submitted answer to an expert answer on a given question. Here is the data:
+        You are comparing a submitted answer to an expert's rubric on a given question. Here is the data:
         [BEGIN DATA]
         ************
         [Question]: ${input}
         ************
-        [Expert]: ${expected}
+        [Expert Rubric]: ${expected}
         ************
         [Submission]: ${output}
         ************
         [END DATA]
+
+		Submissions contain message metadata inside of the <message_content> XML tags. 
+		The attribute \`type=text\` indicates text content. The attribute \`type=tool-call\` indicates a tool call. 
+		Use this metadata to determine the accuracy of the response.
   
-        Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
-        The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
-        (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
-        (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
-        (C) The submitted answer contains all the same details as the expert answer.
-        (D) There is a disagreement between the submitted answer and the expert answer.
+        Compare the factual content of the submitted answer with the expert's answer rubric. Ignore any differences in style, grammar, or punctuation.
+        The submitted answer may either be a subset or superset of the expert's expected answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
+        (A) The submitted answer is a subset of the answer the expert's rubric describes and is fully consistent with it.
+        (B) The submitted answer is a superset of the answer the expert's rubric describes and is fully consistent with it.
+        (C) The submitted answer contains all the same details of the answer the expert's rubric describes.
+        (D) There is a disagreement between the submitted answer and the expert's rubric.
         (E) The answers differ, but these differences don't matter from the perspective of factuality.
       `,
 		schema: z.object({
@@ -49,7 +53,7 @@ export const checkFactuality: ScoreFn = async ({ input, expected, output }) => {
 	 */
 	const scores = {
 		A: 0.4,
-		B: 0.6,
+		B: 1,
 		C: 1,
 		D: 0,
 		E: 1,
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml