
Commit 8a38d5e

Formatting and string fixes
1 parent f4dc8be commit 8a38d5e

8 files changed: 169 additions, 161 deletions

.github/workflows/evals.yml

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-name: Test and check
+name: Evals
 on:
   push:
 
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-24.04
     strategy:
       matrix:
-      node-version: [20, 22]
+      node-version: [22]
     steps:
       - uses: actions/checkout@v4
       - name: Install pnpm

Lines changed: 24 additions & 21 deletions

@@ -1,22 +1,25 @@
-import { describeEval } from "vitest-evals"
-import { eachModel } from "@repo/eval-tools/src/test-models"
-import { checkFactuality } from "@repo/eval-tools/src/scorers"
-import { ToolExecutionOptions, ToolSet, generateText, tool } from "ai"
-import { MCPClientManager } from "agents/mcp/client"
-import { runTask } from "./utils"
+import { MCPClientManager } from 'agents/mcp/client'
+import { generateText, tool, ToolExecutionOptions, ToolSet } from 'ai'
+import { describeEval } from 'vitest-evals'
 
-eachModel("$modelName", ({ model }) => {
-	describeEval("Runs container initialize", {
-		data: async () => [
-			{
-				input: "create and ping a container",
-				expected: "The container_initialize tool was called and then the container_ping tool was called"
-			}
-		],
-		task: async (input) => {
-			return await runTask(model, input)
-		},
-		scorers: [checkFactuality],
-		threshold: 1
-	})
-})
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { runTask } from './utils'
+
+eachModel('$modelName', ({ model }) => {
+	describeEval('Runs container initialize', {
+		data: async () => [
+			{
+				input: 'create and ping a container',
+				expected:
+					'The container_initialize tool was called and then the container_ping tool was called',
+			},
+		],
+		task: async (input) => {
+			return await runTask(model, input)
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+	})
+})

Lines changed: 48 additions & 46 deletions

@@ -1,56 +1,58 @@
-import { MCPClientManager } from "agents/mcp/client"
-import { LanguageModelV1, ToolSet, streamText, tool } from "ai"
-import { jsonSchemaToZod, type JsonSchemaObject } from "@n8n/json-schema-to-zod";
+import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
+import { MCPClientManager } from 'agents/mcp/client'
+import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
 
-export async function runTask(model: LanguageModelV1, input: string) {
-	const clientManager = new MCPClientManager("test-client", "0.0.0")
-	await clientManager.connect("http://localhost:8787/sse")
+import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
 
-	const tools = clientManager.listTools()
-	const toolSet: ToolSet = tools.reduce((acc, v) => {
-		acc[v.name] = tool({
-			parameters: jsonSchemaToZod(v.inputSchema as JsonSchemaObject),
-			description: v.description,
-			execute: async (args, opts) => {
-				const res = await clientManager.callTool(v, args, { signal: opts.abortSignal })
-				console.log(res.toolResult)
-				return res.content
-			},
-		})
-		return acc
-	}, {} as ToolSet)
+export async function runTask(model: LanguageModelV1, input: string) {
+	const clientManager = new MCPClientManager('test-client', '0.0.0')
+	await clientManager.connect('http://localhost:8787/sse')
 
-	const res = streamText({
-		model,
-		system: "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
-		tools: toolSet,
-		prompt: input,
-		maxRetries: 1,
-		maxSteps: 10,
-	})
+	const tools = clientManager.listTools()
+	const toolSet: ToolSet = tools.reduce((acc, v) => {
+		acc[v.name] = tool({
+			parameters: jsonSchemaToZod(v.inputSchema as JsonSchemaObject),
+			description: v.description,
+			execute: async (args, opts) => {
+				const res = await clientManager.callTool(v, args, { signal: opts.abortSignal })
+				console.log(res.toolResult)
+				return res.content
+			},
+		})
+		return acc
+	}, {} as ToolSet)
 
-	for await (const part of res.fullStream) {
+	const res = streamText({
+		model,
+		system:
+			"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
+		tools: toolSet,
+		prompt: input,
+		maxRetries: 1,
+		maxSteps: 10,
+	})
 
-	}
+	for await (const part of res.fullStream) {
+	}
 
-	// convert into an LLM readable result so our factuality checker can validate tool calls
-	let messagesWithTools = ""
-	const messages = (await res.response).messages
-	for (const message of messages) {
-		console.log(message.content)
-		for (const messagePart of message.content) {
-			if (typeof messagePart === "string") {
-				messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
-			} else if (messagePart.type === "tool-call") {
-				messagesWithTools += `<message_content type=${messagePart.type}>
+	// convert into an LLM readable result so our factuality checker can validate tool calls
+	let messagesWithTools = ''
+	const messages = (await res.response).messages
+	for (const message of messages) {
+		console.log(message.content)
+		for (const messagePart of message.content) {
+			if (typeof messagePart === 'string') {
+				messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
+			} else if (messagePart.type === 'tool-call') {
+				messagesWithTools += `<message_content type=${messagePart.type}>
 <tool_name>${messagePart.toolName}</tool_name>
 <tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
 </message_content>`
-			} else if (messagePart.type === "text") {
-				messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
-			}
-		}
-	}
+			} else if (messagePart.type === 'text') {
+				messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
+			}
+		}
+	}
 
-	return messagesWithTools
-}
+	return messagesWithTools
+}

apps/sandbox-container/server/index.ts

Lines changed: 3 additions & 3 deletions

@@ -16,9 +16,9 @@ export { ContainerManager, ContainerMcpAgent }
 export type Env = {
 	CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
 	CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
-	ENVIRONMENT: 'dev' | 'prod',
-	CLOUDFLARE_CLIENT_ID: string,
-	CLOUDFLARE_CLIENT_SECRET: string,
+	ENVIRONMENT: 'dev' | 'prod'
+	CLOUDFLARE_CLIENT_ID: string
+	CLOUDFLARE_CLIENT_SECRET: string
 }
 
 // Context from the auth process, encrypted & stored in the auth token

Lines changed: 11 additions & 11 deletions

@@ -1,13 +1,13 @@
-import { defineWorkersConfig } from "@cloudflare/vitest-pool-workers/config";
+import { defineWorkersConfig } from '@cloudflare/vitest-pool-workers/config'
 
 export default defineWorkersConfig({
-  test: {
-    include: ["**/*.eval.?(c|m)[jt]s?(x)"],
-    poolOptions: {
-      workers: {
-        isolatedStorage: true,
-        wrangler: { configPath: "./wrangler.jsonc" },
-      },
-    },
-  },
-});
+	test: {
+		include: ['**/*.eval.?(c|m)[jt]s?(x)'],
+		poolOptions: {
+			workers: {
+				isolatedStorage: true,
+				wrangler: { configPath: './wrangler.jsonc' },
+			},
+		},
+	},
+})

packages/eval-tools/src/scorers.ts

Lines changed: 37 additions & 37 deletions

@@ -1,22 +1,24 @@
-import { generateObject } from "ai";
-import { z } from "zod";
-import type { ScoreFn } from "vitest-evals";
-import { factualityModel } from "./test-models";
+import { generateObject } from 'ai'
+import { z } from 'zod'
+
+import { factualityModel } from './test-models'
+
+import type { ScoreFn } from 'vitest-evals'
 
 /**
  * Checks the factuality of a submission, using
  * OpenAI's GPT-4o model.
  */
 export const checkFactuality: ScoreFn = async ({ input, expected, output }) => {
-	const { model } = factualityModel;
-	const { object } = await generateObject({
-		model,
-		/**
-		 * Prompt taken from autoevals:
-		 *
-		 * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
-		 */
-		prompt: `
+	const { model } = factualityModel
+	const { object } = await generateObject({
+		model,
+		/**
+		 * Prompt taken from autoevals:
+		 *
+		 * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
+		 */
+		prompt: `
 You are comparing a submitted answer to an expert answer on a given question. Here is the data:
 [BEGIN DATA]
 ************
@@ -36,29 +38,27 @@ export const checkFactuality: ScoreFn = async ({ input, expected, output }) => {
 (D) There is a disagreement between the submitted answer and the expert answer.
 (E) The answers differ, but these differences don't matter from the perspective of factuality.
 `,
-		schema: z.object({
-			answer: z.enum(["A", "B", "C", "D", "E"]).describe("Your selection."),
-			rationale: z
-				.string()
-				.describe("Why you chose this answer. Be very detailed."),
-		}),
-	});
+		schema: z.object({
+			answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'),
+			rationale: z.string().describe('Why you chose this answer. Be very detailed.'),
+		}),
+	})
 
-	/**
-	 * LLM's are well documented at being poor at generating
-	 */
-	const scores = {
-		A: 0.4,
-		B: 0.6,
-		C: 1,
-		D: 0,
-		E: 1,
-	};
+	/**
+	 * LLM's are well documented at being poor at generating
+	 */
+	const scores = {
+		A: 0.4,
+		B: 0.6,
+		C: 1,
+		D: 0,
+		E: 1,
+	}
 
-	return {
-		score: scores[object.answer],
-		metadata: {
-			rationale: object.rationale,
-		},
-	};
-};
+	return {
+		score: scores[object.answer],
+		metadata: {
+			rationale: object.rationale,
+		},
+	}
+}

packages/eval-tools/src/test-models.ts

Lines changed: 35 additions & 32 deletions

@@ -1,47 +1,50 @@
-import { createOpenAI } from "@ai-sdk/openai";
-import { describe } from "vitest";
-import { createWorkersAI } from "workers-ai-provider"
-import { env } from "cloudflare:test"
+import { createOpenAI } from '@ai-sdk/openai'
+import { env } from 'cloudflare:test'
+import { describe } from 'vitest'
+import { createWorkersAI } from 'workers-ai-provider'
 
-export const factualityModel = getOpenAiModel("gpt-4o")
+export const factualityModel = getOpenAiModel('gpt-4o')
 
 type value2key<T, V> = {
-	[K in keyof T]: T[K] extends V ? K : never;
-}[keyof T];
-type AiTextGenerationModels = Exclude<value2key<AiModels, BaseAiTextGeneration>, value2key<AiModels, BaseAiTextToImage>>;
+	[K in keyof T]: T[K] extends V ? K : never
+}[keyof T]
+type AiTextGenerationModels = Exclude<
+	value2key<AiModels, BaseAiTextGeneration>,
+	value2key<AiModels, BaseAiTextToImage>
+>
 
 function getOpenAiModel(modelName: string) {
-	if (!env.OPENAI_API_KEY) {
-		throw new Error("No API token set!");
-	}
-	const ai = createOpenAI({
-		apiKey: env.OPENAI_API_KEY,
-	});
+	if (!env.OPENAI_API_KEY) {
+		throw new Error('No API token set!')
+	}
+	const ai = createOpenAI({
+		apiKey: env.OPENAI_API_KEY,
+	})
 
-	const model = ai(modelName);
+	const model = ai(modelName)
 
-	return { modelName, model, ai };
+	return { modelName, model, ai }
 }
 
 function getWorkersAiModel(modelName: AiTextGenerationModels) {
-	if (!env.AI) {
-		throw new Error("No AI binding provided!")
-	}
+	if (!env.AI) {
+		throw new Error('No AI binding provided!')
+	}
 
-	const ai = createWorkersAI({ binding: env.AI });
+	const ai = createWorkersAI({ binding: env.AI })
 
-	const model = ai(modelName)
-	return { modelName, model, ai }
+	const model = ai(modelName)
+	return { modelName, model, ai }
 }
 
 export const eachModel = describe.each([
-	getOpenAiModel("gpt-4o"),
-	getOpenAiModel("gpt-4o-mini"),
-
-	// llama 3 is somewhat inconsistent
-	//getWorkersAiModel("@cf/meta/llama-3.3-70b-instruct-fp8-fast")
-	// Currently llama 4 is having issues with tool calling
-	//getWorkersAiModel("@cf/meta/llama-4-scout-17b-16e-instruct")
-
-	// TODO: add Claude, Gemini, new OpenAI models via AI gateway
-]);
+	getOpenAiModel('gpt-4o'),
+	getOpenAiModel('gpt-4o-mini'),
+
+	// llama 3 is somewhat inconsistent
+	//getWorkersAiModel("@cf/meta/llama-3.3-70b-instruct-fp8-fast")
+	// Currently llama 4 is having issues with tool calling
+	//getWorkersAiModel("@cf/meta/llama-4-scout-17b-16e-instruct")
+
+	// TODO: add Claude, Gemini, new OpenAI models via AI gateway
+])
