Merge pull request #66 from cloudflare/csparks/add-container-tool-evals

cmsparks · web-flow · commit a99a66996fa1 · 2025-04-23T11:19:04.000-05:00
Add container tool evals for file write, delete, and container exec
diff --git a/apps/sandbox-container/evals/exec.eval.ts b/apps/sandbox-container/evals/exec.eval.ts
@@ -0,0 +1,56 @@
+import { assert, expect } from 'vitest'
+import { describeEval } from 'vitest-evals'
+import { z } from 'zod'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { initializeClient, runTask } from './utils'
+
+/**
+ * Eval: asks each model to create and run a hello-world Python script, then
+ * hard-checks the recorded tool calls before the transcript is scored.
+ */
+eachModel('$modelName', ({ model }) => {
+	// Matcher for a recorded call to `toolName` whose nested `args.args` contains `inner`.
+	const toolCallWith = (toolName: string, inner: Record<string, unknown>) =>
+		expect.objectContaining({
+			type: 'tool-call',
+			toolName,
+			args: { args: expect.objectContaining(inner) },
+		})
+
+	describeEval('Runs a python file in a container', {
+		data: async () => [
+			{
+				input: 'Create a hello world python script and run it',
+				// Concatenated: a `\`-continued template literal would embed the next line's tabs.
+				expected:
+					'The container_file_write tool was called, containing a file ending in .py. ' +
+					'Then the container_exec tool was called with python or python3 as one of the arguments',
+			},
+		],
+		task: async (input) => {
+			const client = await initializeClient()
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
+
+			// The script has to be written to a path that actually ends in `.py`.
+			expect(toolCalls).toEqual(
+				expect.arrayContaining([
+					toolCallWith('container_file_write', { path: expect.stringMatching(/\.py$/) }),
+				])
+			)
+
+			// `python` is a substring of `python3`, so one matcher covers both interpreters.
+			expect(toolCalls).toEqual(
+				expect.arrayContaining([
+					toolCallWith('container_exec', { args: expect.stringContaining('python') }),
+				])
+			)
+
+			return promptOutput
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+	})
+})
diff --git a/apps/sandbox-container/evals/file_write.eval.ts b/apps/sandbox-container/evals/file_write.eval.ts
diff --git a/apps/sandbox-container/evals/files.eval.ts b/apps/sandbox-container/evals/files.eval.ts
@@ -0,0 +1,106 @@
+import { assert, expect } from 'vitest'
+import { describeEval } from 'vitest-evals'
+import { z } from 'zod'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { initializeClient, runTask } from './utils'
+
+/**
+ * Evals for the container file tools, run once per model: the first checks that a
+ * written file reads back with the requested content, the second that a written
+ * file is subsequently deleted by path.
+ */
+eachModel('$modelName', ({ model }) => {
+	describeEval('Runs container file write', {
+		data: async () => [
+			{
+				input: 'write a file named test.txt containing the text "asdf"',
+				expected: 'The container_file_write tool was called and the file\'s content is "asdf"',
+			},
+		],
+		task: async (input) => {
+			const client = await initializeClient()
+			const { promptOutput } = await runTask(client, model, input)
+
+			// Read the file back through the read tool so the check covers what the
+			// tool returns for the file, not only what the model reported writing.
+			const fileRead = client.listTools().find((tool) => tool.name === 'container_file_read')
+			assert(fileRead !== undefined, 'container_file_read tool is not registered')
+
+			const result = await client.callTool(
+				{
+					...fileRead,
+					arguments: {
+						args: { path: 'file://test.txt' },
+					},
+				},
+				// NOTE(review): `z.any() as any` presumably disables result-schema
+				// validation for this call — confirm against the callTool signature.
+				z.any() as any,
+				{}
+			)
+
+			// Exactly one text resource holding the requested content is expected.
+			expect(result.content).toStrictEqual([
+				{
+					type: 'resource',
+					resource: {
+						uri: 'file://test.txt',
+						mimeType: 'text/plain',
+						text: 'asdf',
+					},
+				},
+			])
+
+			return promptOutput
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+	})
+
+	describeEval('Runs container file delete', {
+		data: async () => [
+			{
+				input: 'write a file named test.txt, then delete it',
+				expected:
+					'The container_file_write tool was called and then the container_file_delete tool was called with the same path',
+			},
+		],
+		task: async (input) => {
+			const client = await initializeClient()
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
+
+			// The first recorded write is the reference point for both the path
+			// and the ordering check below.
+			const writeCall = toolCalls.find((call) => call.toolName === 'container_file_write')
+			assert(writeCall !== undefined, 'container_file_write was never called')
+
+			// `args` is cast because only the nested `args.path` field is needed here.
+			const writtenPath = (writeCall.args as { args?: { path?: unknown } }).args?.path
+			assert(typeof writtenPath === 'string', 'container_file_write was called without a path')
+
+			// Only calls recorded after the write are searched, so a delete that came
+			// before the write does not satisfy "write, then delete".
+			const callsAfterWrite = toolCalls.slice(toolCalls.indexOf(writeCall) + 1)
+			expect(callsAfterWrite).toEqual(
+				expect.arrayContaining([
+					expect.objectContaining({
+						type: 'tool-call',
+						toolName: 'container_file_delete',
+						args: {
+							args: expect.objectContaining({
+								path: writtenPath,
+							}),
+						},
+					}),
+				])
+			)
+
+			return promptOutput
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+	})
+})
diff --git a/apps/sandbox-container/evals/initialize.eval.ts b/apps/sandbox-container/evals/initialize.eval.ts
@@ -16,7 +16,8 @@ eachModel('$modelName', ({ model }) => {
 		],
 		task: async (input) => {
 			const client = await initializeClient()
-			return await runTask(client, model, input)
+			const { promptOutput } = await runTask(client, model, input)
+			return promptOutput
 		},
 		scorers: [checkFactuality],
 		threshold: 1,
diff --git a/apps/sandbox-container/evals/utils.ts b/apps/sandbox-container/evals/utils.ts
@@ -1,6 +1,6 @@
 import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
 import { MCPClientManager } from 'agents/mcp/client'
-import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
+import { LanguageModelV1, streamText, StreamTextResult, tool, ToolCallPart, ToolSet } from 'ai'
 import { z } from 'zod'
 
 import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
@@ -15,7 +15,11 @@ export async function runTask(
 	clientManager: MCPClientManager,
 	model: LanguageModelV1,
 	input: string
-) {
+): Promise<{
+	promptOutput: string
+	fullResult: StreamTextResult<ToolSet, never>
+	toolCalls: ToolCallPart[]
+}> {
 	const tools = clientManager.listTools()
 	const toolSet: ToolSet = tools.reduce((acc, v) => {
 		acc[v.name] = tool({
@@ -57,6 +61,7 @@ export async function runTask(
 
 	// convert into an LLM readable result so our factuality checker can validate tool calls
 	let messagesWithTools = ''
+	let toolCalls: ToolCallPart[] = []
 	const messages = (await res.response).messages
 	for (const message of messages) {
 		console.log(message.content)
@@ -68,11 +73,12 @@ export async function runTask(
     <tool_name>${messagePart.toolName}</tool_name>
     <tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
 </message_content>`
+				toolCalls.push(messagePart)
 			} else if (messagePart.type === 'text') {
 				messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
 			}
 		}
 	}
 
-	return messagesWithTools
+	return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
 }