
Commit 8a38d5e

Formatting and string fixes
1 parent f4dc8be commit 8a38d5e

8 files changed: 169 additions, 161 deletions

.github/workflows/evals.yml

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-name: Test and check
+name: Evals
 on:
   push:
 
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-24.04
     strategy:
       matrix:
-      node-version: [20, 22]
+      node-version: [22]
     steps:
       - uses: actions/checkout@v4
       - name: Install pnpm

Lines changed: 24 additions & 21 deletions

@@ -1,22 +1,25 @@
-import { describeEval } from "vitest-evals"
-import { eachModel } from "@repo/eval-tools/src/test-models"
-import { checkFactuality } from "@repo/eval-tools/src/scorers"
-import { ToolExecutionOptions, ToolSet, generateText, tool } from "ai"
-import { MCPClientManager } from "agents/mcp/client"
-import { runTask } from "./utils"
+import { MCPClientManager } from 'agents/mcp/client'
+import { generateText, tool, ToolExecutionOptions, ToolSet } from 'ai'
+import { describeEval } from 'vitest-evals'
 
-eachModel("$modelName", ({ model }) => {
-	describeEval("Runs container initialize", {
-		data: async () => [
-			{
-				input: "create and ping a container",
-				expected: "The container_initialize tool was called and then the container_ping tool was called"
-			}
-		],
-		task: async (input) => {
-			return await runTask(model, input)
-		},
-		scorers: [checkFactuality],
-		threshold: 1
-	})
-})
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { runTask } from './utils'
+
+eachModel('$modelName', ({ model }) => {
+	describeEval('Runs container initialize', {
+		data: async () => [
+			{
+				input: 'create and ping a container',
+				expected:
+					'The container_initialize tool was called and then the container_ping tool was called',
+			},
+		],
+		task: async (input) => {
+			return await runTask(model, input)
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+	})
+})

Lines changed: 48 additions & 46 deletions

@@ -1,56 +1,58 @@
-import { MCPClientManager } from "agents/mcp/client"
-import { LanguageModelV1, ToolSet, streamText, tool } from "ai"
-import { jsonSchemaToZod, type JsonSchemaObject } from "@n8n/json-schema-to-zod";
+import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
+import { MCPClientManager } from 'agents/mcp/client'
+import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
 
-export async function runTask(model: LanguageModelV1, input: string) {
-	const clientManager = new MCPClientManager("test-client", "0.0.0")
-	await clientManager.connect("http://localhost:8787/sse")
+import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
 
-	const tools = clientManager.listTools()
-	const toolSet: ToolSet = tools.reduce((acc, v) => {
-		acc[v.name] = tool({
-			parameters: jsonSchemaToZod(v.inputSchema as JsonSchemaObject),
-			description: v.description,
-			execute: async (args, opts) => {
-				const res = await clientManager.callTool(v, args, { signal: opts.abortSignal })
-				console.log(res.toolResult)
-				return res.content
-			},
-		})
-		return acc
-	}, {} as ToolSet)
+export async function runTask(model: LanguageModelV1, input: string) {
+	const clientManager = new MCPClientManager('test-client', '0.0.0')
+	await clientManager.connect('http://localhost:8787/sse')
 
-	const res = streamText({
-		model,
-		system: "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
-		tools: toolSet,
-		prompt: input,
-		maxRetries: 1,
-		maxSteps: 10,
-	})
+	const tools = clientManager.listTools()
+	const toolSet: ToolSet = tools.reduce((acc, v) => {
+		acc[v.name] = tool({
+			parameters: jsonSchemaToZod(v.inputSchema as JsonSchemaObject),
+			description: v.description,
+			execute: async (args, opts) => {
+				const res = await clientManager.callTool(v, args, { signal: opts.abortSignal })
+				console.log(res.toolResult)
+				return res.content
+			},
+		})
+		return acc
+	}, {} as ToolSet)
 
-	for await (const part of res.fullStream) {
+	const res = streamText({
+		model,
+		system:
+			"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
+		tools: toolSet,
+		prompt: input,
+		maxRetries: 1,
+		maxSteps: 10,
+	})
 
-	}
+	for await (const part of res.fullStream) {
+	}
 
-	// convert into an LLM readable result so our factuality checker can validate tool calls
-	let messagesWithTools = ""
-	const messages = (await res.response).messages
-	for (const message of messages) {
-		console.log(message.content)
-		for (const messagePart of message.content) {
-			if (typeof messagePart === "string") {
-				messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
-			} else if (messagePart.type === "tool-call") {
-				messagesWithTools += `<message_content type=${messagePart.type}>
+	// convert into an LLM readable result so our factuality checker can validate tool calls
+	let messagesWithTools = ''
+	const messages = (await res.response).messages
+	for (const message of messages) {
+		console.log(message.content)
+		for (const messagePart of message.content) {
+			if (typeof messagePart === 'string') {
+				messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
+			} else if (messagePart.type === 'tool-call') {
+				messagesWithTools += `<message_content type=${messagePart.type}>
 <tool_name>${messagePart.toolName}</tool_name>
 <tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
 </message_content>`
-			} else if (messagePart.type === "text") {
-				messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
-			}
-		}
-	}
+			} else if (messagePart.type === 'text') {
+				messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
+			}
+		}
+	}
 
-	return messagesWithTools
-}
+	return messagesWithTools
+}

apps/sandbox-container/server/index.ts

Lines changed: 3 additions & 3 deletions

@@ -16,9 +16,9 @@ export { ContainerManager, ContainerMcpAgent }
 export type Env = {
 	CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
 	CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
-	ENVIRONMENT: 'dev' | 'prod',
-	CLOUDFLARE_CLIENT_ID: string,
-	CLOUDFLARE_CLIENT_SECRET: string,
+	ENVIRONMENT: 'dev' | 'prod'
+	CLOUDFLARE_CLIENT_ID: string
+	CLOUDFLARE_CLIENT_SECRET: string
 }
 
 // Context from the auth process, encrypted & stored in the auth token

Lines changed: 11 additions & 11 deletions

@@ -1,13 +1,13 @@
-import { defineWorkersConfig } from "@cloudflare/vitest-pool-workers/config";
+import { defineWorkersConfig } from '@cloudflare/vitest-pool-workers/config'
 
 export default defineWorkersConfig({
-  test: {
-    include: ["**/*.eval.?(c|m)[jt]s?(x)"],
-    poolOptions: {
-      workers: {
-        isolatedStorage: true,
-        wrangler: { configPath: "./wrangler.jsonc" },
-      },
-    },
-  },
-});
+	test: {
+		include: ['**/*.eval.?(c|m)[jt]s?(x)'],
+		poolOptions: {
+			workers: {
+				isolatedStorage: true,
+				wrangler: { configPath: './wrangler.jsonc' },
+			},
+		},
+	},
+})

packages/eval-tools/src/scorers.ts

Lines changed: 37 additions & 37 deletions

@@ -1,22 +1,24 @@
-import { generateObject } from "ai";
-import { z } from "zod";
-import type { ScoreFn } from "vitest-evals";
-import { factualityModel } from "./test-models";
+import { generateObject } from 'ai'
+import { z } from 'zod'
+
+import { factualityModel } from './test-models'
+
+import type { ScoreFn } from 'vitest-evals'
 
 /**
  * Checks the factuality of a submission, using
  * OpenAI's GPT-4o model.
  */
 export const checkFactuality: ScoreFn = async ({ input, expected, output }) => {
-	const { model } = factualityModel;
-	const { object } = await generateObject({
-		model,
-		/**
-		 * Prompt taken from autoevals:
-		 *
-		 * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
-		 */
-		prompt: `
+	const { model } = factualityModel
+	const { object } = await generateObject({
+		model,
+		/**
+		 * Prompt taken from autoevals:
+		 *
+		 * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
+		 */
+		prompt: `
 You are comparing a submitted answer to an expert answer on a given question. Here is the data:
 [BEGIN DATA]
 ************
@@ -36,29 +38,27 @@ export const checkFactuality: ScoreFn = async ({ input, expected, output }) => {
 (D) There is a disagreement between the submitted answer and the expert answer.
 (E) The answers differ, but these differences don't matter from the perspective of factuality.
 `,
-		schema: z.object({
-			answer: z.enum(["A", "B", "C", "D", "E"]).describe("Your selection."),
-			rationale: z
-				.string()
-				.describe("Why you chose this answer. Be very detailed."),
-		}),
-	});
+		schema: z.object({
+			answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'),
+			rationale: z.string().describe('Why you chose this answer. Be very detailed.'),
+		}),
+	})
 
-	/**
-	 * LLM's are well documented at being poor at generating
-	 */
-	const scores = {
-		A: 0.4,
-		B: 0.6,
-		C: 1,
-		D: 0,
-		E: 1,
-	};
+	/**
+	 * LLM's are well documented at being poor at generating
+	 */
+	const scores = {
+		A: 0.4,
+		B: 0.6,
+		C: 1,
+		D: 0,
+		E: 1,
+	}
 
-	return {
-		score: scores[object.answer],
-		metadata: {
-			rationale: object.rationale,
-		},
-	};
-};
+	return {
+		score: scores[object.answer],
+		metadata: {
+			rationale: object.rationale,
+		},
+	}
+}

packages/eval-tools/src/test-models.ts

Lines changed: 35 additions & 32 deletions

@@ -1,47 +1,50 @@
-import { createOpenAI } from "@ai-sdk/openai";
-import { describe } from "vitest";
-import { createWorkersAI } from "workers-ai-provider"
-import { env } from "cloudflare:test"
+import { createOpenAI } from '@ai-sdk/openai'
+import { env } from 'cloudflare:test'
+import { describe } from 'vitest'
+import { createWorkersAI } from 'workers-ai-provider'
 
-export const factualityModel = getOpenAiModel("gpt-4o")
+export const factualityModel = getOpenAiModel('gpt-4o')
 
 type value2key<T, V> = {
-	[K in keyof T]: T[K] extends V ? K : never;
-}[keyof T];
-type AiTextGenerationModels = Exclude<value2key<AiModels, BaseAiTextGeneration>, value2key<AiModels, BaseAiTextToImage>>;
+	[K in keyof T]: T[K] extends V ? K : never
+}[keyof T]
+type AiTextGenerationModels = Exclude<
+	value2key<AiModels, BaseAiTextGeneration>,
+	value2key<AiModels, BaseAiTextToImage>
+>
 
 function getOpenAiModel(modelName: string) {
-	if (!env.OPENAI_API_KEY) {
-		throw new Error("No API token set!");
-	}
-	const ai = createOpenAI({
-		apiKey: env.OPENAI_API_KEY,
-	});
+	if (!env.OPENAI_API_KEY) {
+		throw new Error('No API token set!')
+	}
+	const ai = createOpenAI({
+		apiKey: env.OPENAI_API_KEY,
+	})
 
-	const model = ai(modelName);
+	const model = ai(modelName)
 
-	return { modelName, model, ai };
+	return { modelName, model, ai }
 }
 
 function getWorkersAiModel(modelName: AiTextGenerationModels) {
-	if (!env.AI) {
-		throw new Error("No AI binding provided!")
-	}
+	if (!env.AI) {
+		throw new Error('No AI binding provided!')
+	}
 
-	const ai = createWorkersAI({ binding: env.AI });
+	const ai = createWorkersAI({ binding: env.AI })
 
-	const model = ai(modelName)
-	return { modelName, model, ai }
+	const model = ai(modelName)
+	return { modelName, model, ai }
 }
 
 export const eachModel = describe.each([
-	getOpenAiModel("gpt-4o"),
-	getOpenAiModel("gpt-4o-mini"),
-
-	// llama 3 is somewhat inconsistent
-	//getWorkersAiModel("@cf/meta/llama-3.3-70b-instruct-fp8-fast")
-	// Currently llama 4 is having issues with tool calling
-	//getWorkersAiModel("@cf/meta/llama-4-scout-17b-16e-instruct")
-
-	// TODO: add Claude, Gemini, new OpenAI models via AI gateway
-]);
+	getOpenAiModel('gpt-4o'),
+	getOpenAiModel('gpt-4o-mini'),
+
+	// llama 3 is somewhat inconsistent
+	//getWorkersAiModel("@cf/meta/llama-3.3-70b-instruct-fp8-fast")
+	// Currently llama 4 is having issues with tool calling
+	//getWorkersAiModel("@cf/meta/llama-4-scout-17b-16e-instruct")
+
+	// TODO: add Claude, Gemini, new OpenAI models via AI gateway
+])
