Skip to content

Commit 30d1024

Browse files
committed
Move runTask to common evals package
1 parent fbcfaeb commit 30d1024

File tree

11 files changed

+100
-171
lines changed

11 files changed

+100
-171
lines changed

apps/sandbox-container/evals/exec.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import { expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33

4+
import { runTask } from '@repo/eval-tools/src/runTask'
45
import { checkFactuality } from '@repo/eval-tools/src/scorers'
56
import { eachModel } from '@repo/eval-tools/src/test-models'
67

7-
import { initializeClient, runTask } from './utils'
8+
import { initializeClient } from './utils'
89

910
eachModel('$modelName', ({ model }) => {
1011
describeEval('Runs a python file in a container', {

apps/sandbox-container/evals/files.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@ import { assert, expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33
import { z } from 'zod'
44

5+
import { runTask } from '@repo/eval-tools/src/runTask'
56
import { checkFactuality } from '@repo/eval-tools/src/scorers'
67
import { eachModel } from '@repo/eval-tools/src/test-models'
78

8-
import { initializeClient, runTask } from './utils'
9+
import { initializeClient } from './utils'
910

1011
eachModel('$modelName', ({ model }) => {
1112
describeEval('Runs container file write', {

apps/sandbox-container/evals/initialize.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import { describeEval } from 'vitest-evals'
22

3+
import { runTask } from '@repo/eval-tools/src/runTask'
34
import { checkFactuality } from '@repo/eval-tools/src/scorers'
45
import { eachModel } from '@repo/eval-tools/src/test-models'
56

6-
import { initializeClient, runTask } from './utils'
7+
import { initializeClient } from './utils'
78

89
eachModel('$modelName', ({ model }) => {
910
describeEval('Runs container initialize', {

apps/sandbox-container/evals/utils.ts

Lines changed: 0 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,7 @@
11
import { MCPClientManager } from 'agents/mcp/client'
2-
import { jsonSchema, streamText, tool } from 'ai'
3-
import { z } from 'zod'
4-
5-
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
62

73
export async function initializeClient(): Promise<MCPClientManager> {
84
const clientManager = new MCPClientManager('test-client', '0.0.0')
95
await clientManager.connect('http://localhost:8976/sse')
106
return clientManager
117
}
12-
13-
export async function runTask(
14-
clientManager: MCPClientManager,
15-
model: LanguageModelV1,
16-
input: string
17-
): Promise<{
18-
promptOutput: string
19-
fullResult: StreamTextResult<ToolSet, never>
20-
toolCalls: ToolCallPart[]
21-
}> {
22-
const tools = clientManager.listTools()
23-
const toolSet: ToolSet = tools.reduce((acc, v) => {
24-
if (!v.inputSchema.properties) {
25-
v.inputSchema.properties = {}
26-
}
27-
28-
acc[v.name] = tool({
29-
parameters: jsonSchema(v.inputSchema as any),
30-
description: v.description,
31-
execute: async (args: any, opts) => {
32-
try {
33-
const res = await clientManager.callTool(
34-
{
35-
...v,
36-
arguments: { ...args },
37-
},
38-
z.any() as any,
39-
{ signal: opts.abortSignal }
40-
)
41-
return res.content
42-
} catch (e) {
43-
console.log('Error calling tool')
44-
console.log(e)
45-
return e
46-
}
47-
},
48-
})
49-
return acc
50-
}, {} as ToolSet)
51-
52-
const res = streamText({
53-
model,
54-
system:
55-
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
56-
tools: toolSet,
57-
prompt: input,
58-
maxRetries: 1,
59-
maxSteps: 10,
60-
})
61-
62-
// consume the stream
63-
// eslint-disable-next-line no-empty
64-
for await (const _ of res.fullStream) {
65-
}
66-
67-
// convert into an LLM readable result so our factuality checker can validate tool calls
68-
let messagesWithTools = ''
69-
const toolCalls: ToolCallPart[] = []
70-
const messages = (await res.response).messages
71-
for (const message of messages) {
72-
console.log(message.content)
73-
for (const messagePart of message.content) {
74-
if (typeof messagePart === 'string') {
75-
messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
76-
} else if (messagePart.type === 'tool-call') {
77-
messagesWithTools += `<message_content type=${messagePart.type}>
78-
<tool_name>${messagePart.toolName}</tool_name>
79-
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
80-
</message_content>`
81-
toolCalls.push(messagePart)
82-
} else if (messagePart.type === 'text') {
83-
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
84-
}
85-
}
86-
}
87-
88-
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
89-
}

apps/workers-bindings/evals/accounts.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import { expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33

4+
import { runTask } from '@repo/eval-tools/src/runTask'
45
import { checkFactuality } from '@repo/eval-tools/src/scorers'
56
import { eachModel } from '@repo/eval-tools/src/test-models'
67

7-
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
import { initializeClient } from './utils' // Assuming utils.ts will exist here
89

910
// Define a mock account ID for testing
1011
const MOCK_ACCOUNT_ID = 'mock-account-12345'

apps/workers-bindings/evals/hyperdrive.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import { expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33

4+
import { runTask } from '@repo/eval-tools/src/runTask'
45
import { checkFactuality } from '@repo/eval-tools/src/scorers'
56
import { eachModel } from '@repo/eval-tools/src/test-models'
67
import { HYPERDRIVE_TOOLS } from '@repo/mcp-common/src/tools/hyperdrive'
78

8-
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
9+
import { initializeClient } from './utils' // Assuming utils.ts will exist here
910

1011
// TODO: Add test for creating hyperdrive config with the following params once we can securely pass parameters to the tool. See: https://github.com/modelcontextprotocol/modelcontextprotocol/pull/382
1112
// const HYPERDRIVE_NAME = 'neon-test-hyperdrive'

apps/workers-bindings/evals/kv_namespaces.eval.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import { expect } from 'vitest'
22
import { describeEval } from 'vitest-evals'
33

4+
import { runTask } from '@repo/eval-tools/src/runTask'
45
import { checkFactuality } from '@repo/eval-tools/src/scorers'
56
import { eachModel } from '@repo/eval-tools/src/test-models'
67
import { KV_NAMESPACE_TOOLS } from '@repo/mcp-common/src/tools/kv_namespace'
78

8-
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
9+
import { initializeClient } from './utils' // Assuming utils.ts will exist here
910

1011
eachModel('$modelName', ({ model }) => {
1112
describeEval('Create Cloudflare KV Namespace', {

apps/workers-bindings/evals/utils.ts

Lines changed: 0 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,7 @@
11
import { MCPClientManager } from 'agents/mcp/client'
2-
import { jsonSchema, streamText, tool } from 'ai'
3-
import { z } from 'zod'
4-
5-
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
62

73
export async function initializeClient(): Promise<MCPClientManager> {
84
const clientManager = new MCPClientManager('test-client', '0.0.0')
95
await clientManager.connect('http://localhost:8977/sse')
106
return clientManager
117
}
12-
13-
export async function runTask(
14-
clientManager: MCPClientManager,
15-
model: LanguageModelV1,
16-
input: string
17-
): Promise<{
18-
promptOutput: string
19-
fullResult: StreamTextResult<ToolSet, never>
20-
toolCalls: ToolCallPart[]
21-
}> {
22-
const tools = clientManager.listTools()
23-
const toolSet: ToolSet = tools.reduce((acc, v) => {
24-
if (!v.inputSchema.properties) {
25-
v.inputSchema.properties = {}
26-
}
27-
28-
acc[v.name] = tool({
29-
parameters: jsonSchema(v.inputSchema as any),
30-
description: v.description,
31-
execute: async (args: any, opts) => {
32-
try {
33-
const res = await clientManager.callTool(
34-
{
35-
...v,
36-
arguments: { ...args },
37-
},
38-
z.any() as any,
39-
{ signal: opts.abortSignal }
40-
)
41-
return res.content
42-
} catch (e) {
43-
console.log('Error calling tool')
44-
console.log(e)
45-
return e
46-
}
47-
},
48-
})
49-
return acc
50-
}, {} as ToolSet)
51-
52-
const res = streamText({
53-
model,
54-
system:
55-
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
56-
tools: toolSet,
57-
prompt: input,
58-
maxRetries: 1,
59-
maxSteps: 10,
60-
})
61-
62-
// we need to consume the full stream, so this is empty
63-
// eslint-disable-next-line no-empty
64-
for await (const _ of res.fullStream) {
65-
}
66-
67-
// convert into an LLM readable result so our factuality checker can validate tool calls
68-
let messagesWithTools = ''
69-
const toolCalls: ToolCallPart[] = []
70-
const response = await res.response
71-
const messages = response.messages
72-
73-
for (const message of messages) {
74-
for (const messagePart of message.content) {
75-
if (typeof messagePart === 'string') {
76-
messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
77-
} else if (messagePart.type === 'tool-call') {
78-
messagesWithTools += `<message_content type=${messagePart.type}>
79-
<tool_name>${messagePart.toolName}</tool_name>
80-
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
81-
</message_content>`
82-
toolCalls.push(messagePart)
83-
} else if (messagePart.type === 'text') {
84-
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
85-
}
86-
}
87-
}
88-
89-
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
90-
}

packages/eval-tools/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"dependencies": {
1414
"@ai-sdk/openai": "1.3.20",
1515
"@cloudflare/vitest-pool-workers": "0.8.14",
16+
"agents": "0.0.67",
1617
"ai": "4.3.10",
1718
"workers-ai-provider": "0.3.0",
1819
"wrangler": "4.10.0",

packages/eval-tools/src/runTask.ts

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import { type MCPClientManager } from 'agents/mcp/client'
2+
import { jsonSchema, streamText, tool } from 'ai'
3+
import { z } from 'zod'
4+
5+
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
6+
7+
export async function runTask(
8+
clientManager: MCPClientManager,
9+
model: LanguageModelV1,
10+
input: string
11+
): Promise<{
12+
promptOutput: string
13+
fullResult: StreamTextResult<ToolSet, never>
14+
toolCalls: ToolCallPart[]
15+
}> {
16+
const tools = clientManager.listTools()
17+
const toolSet: ToolSet = tools.reduce((acc, v) => {
18+
if (!v.inputSchema.properties) {
19+
v.inputSchema.properties = {}
20+
}
21+
22+
acc[v.name] = tool({
23+
parameters: jsonSchema(v.inputSchema as any),
24+
description: v.description,
25+
execute: async (args: any, opts) => {
26+
try {
27+
const res = await clientManager.callTool(
28+
{
29+
...v,
30+
arguments: { ...args },
31+
},
32+
z.any() as any,
33+
{ signal: opts.abortSignal }
34+
)
35+
return res.content
36+
} catch (e) {
37+
console.log('Error calling tool')
38+
console.log(e)
39+
return e
40+
}
41+
},
42+
})
43+
return acc
44+
}, {} as ToolSet)
45+
46+
const res = streamText({
47+
model,
48+
system:
49+
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
50+
tools: toolSet,
51+
prompt: input,
52+
maxRetries: 1,
53+
maxSteps: 10,
54+
})
55+
56+
// we need to consume the full stream, so this is empty
57+
// eslint-disable-next-line no-empty
58+
for await (const _ of res.fullStream) {
59+
}
60+
61+
// convert into an LLM readable result so our factuality checker can validate tool calls
62+
let messagesWithTools = ''
63+
const toolCalls: ToolCallPart[] = []
64+
const response = await res.response
65+
const messages = response.messages
66+
67+
for (const message of messages) {
68+
for (const messagePart of message.content) {
69+
if (typeof messagePart === 'string') {
70+
messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
71+
} else if (messagePart.type === 'tool-call') {
72+
messagesWithTools += `<message_content type=${messagePart.type}>
73+
<tool_name>${messagePart.toolName}</tool_name>
74+
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
75+
</message_content>`
76+
toolCalls.push(messagePart)
77+
} else if (messagePart.type === 'text') {
78+
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
79+
}
80+
}
81+
}
82+
83+
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
84+
}

0 commit comments

Comments
 (0)