Skip to content

Commit a99a669

Browse files
authored
Merge pull request #66 from cloudflare/csparks/add-container-tool-evals
Add container tool evals for file write, delete, and container exec
2 parents 4717f26 + d62641c commit a99a669

File tree

5 files changed

+173
-59
lines changed

5 files changed

+173
-59
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import { assert, expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
import { z } from 'zod'
4+
5+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
6+
import { eachModel } from '@repo/eval-tools/src/test-models'
7+
8+
import { initializeClient, runTask } from './utils'
9+
10+
eachModel('$modelName', ({ model }) => {
11+
/**
 * Eval: ask the model to create a hello-world Python script and run it.
 *
 * The task fails fast unless the recorded tool calls contain both a
 * `container_exec` call whose `args.args` contains "python" and a
 * `container_file_write` call whose `args.path` contains ".py". The returned
 * `promptOutput` is then scored by `checkFactuality` against `expected`.
 */
describeEval('Runs a python file in a container', {
  data: async () => [
    {
      input: 'Create a hello world python script and run it',
      // The tool names here must match the ones asserted in `task` below;
      // otherwise the factuality scorer is asked about a tool that is never called.
      expected:
        'The container_file_write tool was called, containing a file ending in .py. ' +
        'Then the container_exec tool was called with python or python3 as one of the arguments',
    },
  ],
  task: async (input) => {
    const client = await initializeClient()
    const { promptOutput, toolCalls } = await runTask(client, model, input)

    // The script must have been executed with a python interpreter.
    expect(toolCalls).toEqual(
      expect.arrayContaining([
        expect.objectContaining({
          type: 'tool-call',
          toolName: 'container_exec',
          args: {
            args: expect.objectContaining({
              args: expect.stringContaining('python'),
            }),
          },
        }),
      ])
    )

    // A python source file must have been written.
    expect(toolCalls).toEqual(
      expect.arrayContaining([
        expect.objectContaining({
          type: 'tool-call',
          toolName: 'container_file_write',
          args: {
            args: expect.objectContaining({
              path: expect.stringContaining('.py'),
            }),
          },
        }),
      ])
    )

    return promptOutput
  },
  scorers: [checkFactuality],
  threshold: 1,
})
56+
})

apps/sandbox-container/evals/file_write.eval.ts

Lines changed: 0 additions & 55 deletions
This file was deleted.
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import { assert, expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
import { z } from 'zod'
4+
5+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
6+
import { eachModel } from '@repo/eval-tools/src/test-models'
7+
8+
import { initializeClient, runTask } from './utils'
9+
10+
eachModel('$modelName', ({ model }) => {
11+
/**
 * Eval: ask the model to write `test.txt` containing "asdf".
 *
 * After the model has run, the file is read back through the
 * `container_file_read` tool and its content must strictly equal a single
 * plain-text resource with the text "asdf". The returned `promptOutput` is
 * then scored by `checkFactuality` against `expected`.
 */
describeEval('Runs container file write', {
  data: async () => [
    {
      input: 'write a file named test.txt containing the text "asdf"',
      expected: 'The container_file_write tool was called and the file\'s content is "asdf"',
    },
  ],
  task: async (input) => {
    const client = await initializeClient()
    const { promptOutput } = await runTask(client, model, input)

    // Look the read tool up by name so the written file can be verified directly.
    const fileRead = client.listTools().find((tool) => tool.name === 'container_file_read')
    assert(fileRead !== undefined, 'container_file_read tool was not found in client.listTools()')

    const result = await client.callTool(
      {
        ...fileRead,
        arguments: {
          args: { path: 'file://test.txt' },
        },
      },
      // NOTE(review): the cast sidesteps the result-schema parameter type of
      // callTool — confirm whether a properly typed schema can be passed instead.
      z.any() as any,
      {}
    )

    expect(result.content).toStrictEqual([
      {
        type: 'resource',
        resource: {
          uri: 'file://test.txt',
          mimeType: 'text/plain',
          text: 'asdf',
        },
      },
    ])

    return promptOutput
  },
  scorers: [checkFactuality],
  threshold: 1,
})
55+
56+
/**
 * Eval: ask the model to write `test.txt` and then delete it.
 *
 * The path of the first recorded `container_file_write` call is taken as the
 * reference; the recorded tool calls must then contain a write call and a
 * `container_file_delete` call that both carry that path. The returned
 * `promptOutput` is scored by `checkFactuality` against `expected`.
 */
describeEval('Runs container file delete', {
  data: async () => [
    {
      input: 'write a file named test.txt, then delete it',
      expected:
        'The container_file_write tool was called and then the container_file_delete tool was called with the same parameters',
    },
  ],
  task: async (input) => {
    const client = await initializeClient()
    const { promptOutput, toolCalls } = await runTask(client, model, input)

    // Arguments of the first write call; its path is the reference for both checks.
    const writeCall = toolCalls.find((call) => call.toolName === 'container_file_write')
    const writeArgs = writeCall?.args as { args: { path: string } } | undefined
    assert(writeArgs !== undefined)

    const writtenPath = writeArgs.args.path

    // Matcher: the call list contains a call to `toolName` targeting the written path.
    const containsCallOnWrittenPath = (toolName: string) =>
      expect.arrayContaining([
        expect.objectContaining({
          type: 'tool-call',
          toolName,
          args: {
            args: expect.objectContaining({
              path: writtenPath,
            }),
          },
        }),
      ])

    expect(toolCalls).toEqual(containsCallOnWrittenPath('container_file_write'))
    expect(toolCalls).toEqual(containsCallOnWrittenPath('container_file_delete'))

    return promptOutput
  },
  scorers: [checkFactuality],
  threshold: 1,
})
106+
})

apps/sandbox-container/evals/initialize.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ eachModel('$modelName', ({ model }) => {
1616
],
1717
task: async (input) => {
1818
const client = await initializeClient()
19-
return await runTask(client, model, input)
19+
const { promptOutput } = await runTask(client, model, input)
20+
return promptOutput
2021
},
2122
scorers: [checkFactuality],
2223
threshold: 1,

apps/sandbox-container/evals/utils.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
22
import { MCPClientManager } from 'agents/mcp/client'
3-
import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
3+
import { LanguageModelV1, streamText, StreamTextResult, tool, ToolCallPart, ToolSet } from 'ai'
44
import { z } from 'zod'
55

66
import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
@@ -15,7 +15,11 @@ export async function runTask(
1515
clientManager: MCPClientManager,
1616
model: LanguageModelV1,
1717
input: string
18-
) {
18+
): Promise<{
19+
promptOutput: string
20+
fullResult: StreamTextResult<ToolSet, never>
21+
toolCalls: ToolCallPart[]
22+
}> {
1923
const tools = clientManager.listTools()
2024
const toolSet: ToolSet = tools.reduce((acc, v) => {
2125
acc[v.name] = tool({
@@ -57,6 +61,7 @@ export async function runTask(
5761

5862
// convert into an LLM readable result so our factuality checker can validate tool calls
5963
let messagesWithTools = ''
64+
let toolCalls: ToolCallPart[] = []
6065
const messages = (await res.response).messages
6166
for (const message of messages) {
6267
console.log(message.content)
@@ -68,11 +73,12 @@ export async function runTask(
6873
<tool_name>${messagePart.toolName}</tool_name>
6974
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
7075
</message_content>`
76+
toolCalls.push(messagePart)
7177
} else if (messagePart.type === 'text') {
7278
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
7379
}
7480
}
7581
}
7682

77-
return messagesWithTools
83+
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
7884
}

0 commit comments

Comments
 (0)