Skip to content

Commit 30d1024

Browse files
committed
Move runTask to common evals package
1 parent fbcfaeb commit 30d1024

File tree

11 files changed

+100
-171
lines changed

11 files changed

+100
-171
lines changed

apps/sandbox-container/evals/exec.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import { expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33

4+
import { runTask } from '@repo/eval-tools/src/runTask'
45
import { checkFactuality } from '@repo/eval-tools/src/scorers'
56
import { eachModel } from '@repo/eval-tools/src/test-models'
67

7-
import { initializeClient, runTask } from './utils'
8+
import { initializeClient } from './utils'
89

910
eachModel('$modelName', ({ model }) => {
1011
describeEval('Runs a python file in a container', {

apps/sandbox-container/evals/files.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@ import { assert, expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33
import { z } from 'zod'
44

5+
import { runTask } from '@repo/eval-tools/src/runTask'
56
import { checkFactuality } from '@repo/eval-tools/src/scorers'
67
import { eachModel } from '@repo/eval-tools/src/test-models'
78

8-
import { initializeClient, runTask } from './utils'
9+
import { initializeClient } from './utils'
910

1011
eachModel('$modelName', ({ model }) => {
1112
describeEval('Runs container file write', {

apps/sandbox-container/evals/initialize.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import { describeEval } from 'vitest-evals'
22

3+
import { runTask } from '@repo/eval-tools/src/runTask'
34
import { checkFactuality } from '@repo/eval-tools/src/scorers'
45
import { eachModel } from '@repo/eval-tools/src/test-models'
56

6-
import { initializeClient, runTask } from './utils'
7+
import { initializeClient } from './utils'
78

89
eachModel('$modelName', ({ model }) => {
910
describeEval('Runs container initialize', {

apps/sandbox-container/evals/utils.ts

Lines changed: 0 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,7 @@
11
import { MCPClientManager } from 'agents/mcp/client'
2-
import { jsonSchema, streamText, tool } from 'ai'
3-
import { z } from 'zod'
4-
5-
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
62

73
export async function initializeClient(): Promise<MCPClientManager> {
84
const clientManager = new MCPClientManager('test-client', '0.0.0')
95
await clientManager.connect('http://localhost:8976/sse')
106
return clientManager
117
}
12-
13-
export async function runTask(
14-
clientManager: MCPClientManager,
15-
model: LanguageModelV1,
16-
input: string
17-
): Promise<{
18-
promptOutput: string
19-
fullResult: StreamTextResult<ToolSet, never>
20-
toolCalls: ToolCallPart[]
21-
}> {
22-
const tools = clientManager.listTools()
23-
const toolSet: ToolSet = tools.reduce((acc, v) => {
24-
if (!v.inputSchema.properties) {
25-
v.inputSchema.properties = {}
26-
}
27-
28-
acc[v.name] = tool({
29-
parameters: jsonSchema(v.inputSchema as any),
30-
description: v.description,
31-
execute: async (args: any, opts) => {
32-
try {
33-
const res = await clientManager.callTool(
34-
{
35-
...v,
36-
arguments: { ...args },
37-
},
38-
z.any() as any,
39-
{ signal: opts.abortSignal }
40-
)
41-
return res.content
42-
} catch (e) {
43-
console.log('Error calling tool')
44-
console.log(e)
45-
return e
46-
}
47-
},
48-
})
49-
return acc
50-
}, {} as ToolSet)
51-
52-
const res = streamText({
53-
model,
54-
system:
55-
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
56-
tools: toolSet,
57-
prompt: input,
58-
maxRetries: 1,
59-
maxSteps: 10,
60-
})
61-
62-
// consume the stream
63-
// eslint-disable-next-line no-empty
64-
for await (const _ of res.fullStream) {
65-
}
66-
67-
// convert into an LLM readable result so our factuality checker can validate tool calls
68-
let messagesWithTools = ''
69-
const toolCalls: ToolCallPart[] = []
70-
const messages = (await res.response).messages
71-
for (const message of messages) {
72-
console.log(message.content)
73-
for (const messagePart of message.content) {
74-
if (typeof messagePart === 'string') {
75-
messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
76-
} else if (messagePart.type === 'tool-call') {
77-
messagesWithTools += `<message_content type=${messagePart.type}>
78-
<tool_name>${messagePart.toolName}</tool_name>
79-
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
80-
</message_content>`
81-
toolCalls.push(messagePart)
82-
} else if (messagePart.type === 'text') {
83-
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
84-
}
85-
}
86-
}
87-
88-
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
89-
}

apps/workers-bindings/evals/accounts.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import { expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33

4+
import { runTask } from '@repo/eval-tools/src/runTask'
45
import { checkFactuality } from '@repo/eval-tools/src/scorers'
56
import { eachModel } from '@repo/eval-tools/src/test-models'
67

7-
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
import { initializeClient } from './utils' // Assuming utils.ts will exist here
89

910
// Define a mock account ID for testing
1011
const MOCK_ACCOUNT_ID = 'mock-account-12345'

apps/workers-bindings/evals/hyperdrive.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import { expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33

4+
import { runTask } from '@repo/eval-tools/src/runTask'
45
import { checkFactuality } from '@repo/eval-tools/src/scorers'
56
import { eachModel } from '@repo/eval-tools/src/test-models'
67
import { HYPERDRIVE_TOOLS } from '@repo/mcp-common/src/tools/hyperdrive'
78

8-
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
9+
import { initializeClient } from './utils' // Assuming utils.ts will exist here
910

1011
// TODO: Add test for creating hyperdrive config with the following params once we can securely pass parameters to the tool. See: https://github.com/modelcontextprotocol/modelcontextprotocol/pull/382
1112
// const HYPERDRIVE_NAME = 'neon-test-hyperdrive'

apps/workers-bindings/evals/kv_namespaces.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import { expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33

4+
import { runTask } from '@repo/eval-tools/src/runTask'
45
import { checkFactuality } from '@repo/eval-tools/src/scorers'
56
import { eachModel } from '@repo/eval-tools/src/test-models'
67
import { KV_NAMESPACE_TOOLS } from '@repo/mcp-common/src/tools/kv_namespace'
78

8-
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
9+
import { initializeClient } from './utils' // Assuming utils.ts will exist here
910

1011
eachModel('$modelName', ({ model }) => {
1112
describeEval('Create Cloudflare KV Namespace', {

apps/workers-bindings/evals/utils.ts

Lines changed: 0 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,7 @@
11
import { MCPClientManager } from 'agents/mcp/client'
2-
import { jsonSchema, streamText, tool } from 'ai'
3-
import { z } from 'zod'
4-
5-
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
62

73
export async function initializeClient(): Promise<MCPClientManager> {
84
const clientManager = new MCPClientManager('test-client', '0.0.0')
95
await clientManager.connect('http://localhost:8977/sse')
106
return clientManager
117
}
12-
13-
export async function runTask(
14-
clientManager: MCPClientManager,
15-
model: LanguageModelV1,
16-
input: string
17-
): Promise<{
18-
promptOutput: string
19-
fullResult: StreamTextResult<ToolSet, never>
20-
toolCalls: ToolCallPart[]
21-
}> {
22-
const tools = clientManager.listTools()
23-
const toolSet: ToolSet = tools.reduce((acc, v) => {
24-
if (!v.inputSchema.properties) {
25-
v.inputSchema.properties = {}
26-
}
27-
28-
acc[v.name] = tool({
29-
parameters: jsonSchema(v.inputSchema as any),
30-
description: v.description,
31-
execute: async (args: any, opts) => {
32-
try {
33-
const res = await clientManager.callTool(
34-
{
35-
...v,
36-
arguments: { ...args },
37-
},
38-
z.any() as any,
39-
{ signal: opts.abortSignal }
40-
)
41-
return res.content
42-
} catch (e) {
43-
console.log('Error calling tool')
44-
console.log(e)
45-
return e
46-
}
47-
},
48-
})
49-
return acc
50-
}, {} as ToolSet)
51-
52-
const res = streamText({
53-
model,
54-
system:
55-
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
56-
tools: toolSet,
57-
prompt: input,
58-
maxRetries: 1,
59-
maxSteps: 10,
60-
})
61-
62-
// we need to consume the full stream, so this is empty
63-
// eslint-disable-next-line no-empty
64-
for await (const _ of res.fullStream) {
65-
}
66-
67-
// convert into an LLM readable result so our factuality checker can validate tool calls
68-
let messagesWithTools = ''
69-
const toolCalls: ToolCallPart[] = []
70-
const response = await res.response
71-
const messages = response.messages
72-
73-
for (const message of messages) {
74-
for (const messagePart of message.content) {
75-
if (typeof messagePart === 'string') {
76-
messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
77-
} else if (messagePart.type === 'tool-call') {
78-
messagesWithTools += `<message_content type=${messagePart.type}>
79-
<tool_name>${messagePart.toolName}</tool_name>
80-
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
81-
</message_content>`
82-
toolCalls.push(messagePart)
83-
} else if (messagePart.type === 'text') {
84-
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
85-
}
86-
}
87-
}
88-
89-
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
90-
}

packages/eval-tools/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"dependencies": {
1414
"@ai-sdk/openai": "1.3.20",
1515
"@cloudflare/vitest-pool-workers": "0.8.14",
16+
"agents": "0.0.67",
1617
"ai": "4.3.10",
1718
"workers-ai-provider": "0.3.0",
1819
"wrangler": "4.10.0",

packages/eval-tools/src/runTask.ts

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import { type MCPClientManager } from 'agents/mcp/client'
2+
import { jsonSchema, streamText, tool } from 'ai'
3+
import { z } from 'zod'
4+
5+
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
6+
7+
export async function runTask(
8+
clientManager: MCPClientManager,
9+
model: LanguageModelV1,
10+
input: string
11+
): Promise<{
12+
promptOutput: string
13+
fullResult: StreamTextResult<ToolSet, never>
14+
toolCalls: ToolCallPart[]
15+
}> {
16+
const tools = clientManager.listTools()
17+
const toolSet: ToolSet = tools.reduce((acc, v) => {
18+
if (!v.inputSchema.properties) {
19+
v.inputSchema.properties = {}
20+
}
21+
22+
acc[v.name] = tool({
23+
parameters: jsonSchema(v.inputSchema as any),
24+
description: v.description,
25+
execute: async (args: any, opts) => {
26+
try {
27+
const res = await clientManager.callTool(
28+
{
29+
...v,
30+
arguments: { ...args },
31+
},
32+
z.any() as any,
33+
{ signal: opts.abortSignal }
34+
)
35+
return res.content
36+
} catch (e) {
37+
console.log('Error calling tool')
38+
console.log(e)
39+
return e
40+
}
41+
},
42+
})
43+
return acc
44+
}, {} as ToolSet)
45+
46+
const res = streamText({
47+
model,
48+
system:
49+
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
50+
tools: toolSet,
51+
prompt: input,
52+
maxRetries: 1,
53+
maxSteps: 10,
54+
})
55+
56+
// we need to consume the full stream, so this is empty
57+
// eslint-disable-next-line no-empty
58+
for await (const _ of res.fullStream) {
59+
}
60+
61+
// convert into an LLM readable result so our factuality checker can validate tool calls
62+
let messagesWithTools = ''
63+
const toolCalls: ToolCallPart[] = []
64+
const response = await res.response
65+
const messages = response.messages
66+
67+
for (const message of messages) {
68+
for (const messagePart of message.content) {
69+
if (typeof messagePart === 'string') {
70+
messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
71+
} else if (messagePart.type === 'tool-call') {
72+
messagesWithTools += `<message_content type=${messagePart.type}>
73+
<tool_name>${messagePart.toolName}</tool_name>
74+
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
75+
</message_content>`
76+
toolCalls.push(messagePart)
77+
} else if (messagePart.type === 'text') {
78+
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
79+
}
80+
}
81+
}
82+
83+
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
84+
}

0 commit comments

Comments
 (0)