cloudflare
diff --git a/‎apps/workers-bindings/evals/accounts.eval.ts‎
Lines changed: 70 additions & 0 deletions b/‎apps/workers-bindings/evals/accounts.eval.ts‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎apps/workers-bindings/test/types.d.ts‎ renamed to ‎apps/workers-bindings/evals/types.d.ts‎ b/‎apps/workers-bindings/test/types.d.ts‎ renamed to ‎apps/workers-bindings/evals/types.d.ts‎
diff --git a/‎apps/workers-bindings/evals/utils.ts‎
Lines changed: 85 additions & 0 deletions b/‎apps/workers-bindings/evals/utils.ts‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎apps/workers-bindings/package.json‎
Lines changed: 8 additions & 0 deletions b/‎apps/workers-bindings/package.json‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎apps/workers-bindings/test/index.test.ts‎
Lines changed: 0 additions & 7 deletions b/‎apps/workers-bindings/test/index.test.ts‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎apps/workers-bindings/tsconfig.json‎
Lines changed: 1 addition & 1 deletion b/‎apps/workers-bindings/tsconfig.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎apps/workers-bindings/vitest.config.evals.ts‎
Lines changed: 18 additions & 0 deletions b/‎apps/workers-bindings/vitest.config.evals.ts‎
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,70 @@
+import { expect, test } from 'vitest'
+import { describeEval } from 'vitest-evals'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+// Eval helpers (MCP client setup and the task runner) exported from ./utils
+import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
+
+// Placeholder account ID: interpolated into the eval prompt and the expected
+// description, and compared against the arguments of the set_active_account call
+const MOCK_ACCOUNT_ID = 'mock-account-12345'
+
+/**
+ * Account tool evals, registered once per model supplied by `eachModel`.
+ *
+ * Each case sends its `input` prompt through `runTask`, asserts that the
+ * matching account tool was called, and returns the rendered transcript so the
+ * configured scorers can compare it with the case's `expected` description.
+ */
+eachModel('$modelName', ({ model }) => {
+	describeEval('Account Tool Evaluations', {
+		// Test cases for account tools
+		data: async () => [
+			{
+				input: 'List all my Cloudflare accounts.',
+				expected: 'The accounts_list tool should be called to retrieve the list of accounts.',
+			},
+			{
+				input: `Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}.`,
+				expected: `The set_active_account tool should be called with the account ID ${MOCK_ACCOUNT_ID}.`,
+			},
+			// TODO: Add more test cases, e.g., edge cases, invalid inputs?
+		],
+		// The core task execution logic, accepting only input
+		task: async (input: string) => {
+			// Connect to the MCP server under test
+			const client = await initializeClient()
+
+			// Send the prompt to the model; the result shape is inferred from runTask's
+			// declared return type, so no separate annotation is needed here
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
+
+			// Assertions are selected by matching on the prompt text of each case
+			if (input.includes('List all my Cloudflare accounts')) {
+				const toolCall = toolCalls.find((call) => call.toolName === 'accounts_list')
+				expect(toolCall, 'Tool accounts_list was not called').toBeDefined()
+			} else if (input.includes(`Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}`)) {
+				const toolCall = toolCalls.find((call) => call.toolName === 'set_active_account')
+				expect(toolCall, 'Tool set_active_account was not called').toBeDefined()
+
+				// Check arguments passed to set_active_account
+				expect(toolCall?.args, 'Arguments for set_active_account did not match').toEqual(
+					expect.objectContaining({ activeAccountIdParam: MOCK_ACCOUNT_ID })
+				)
+
+				// NOTE: the agent's stored active account is not verified here. runTask
+				// returns only the transcript, the stream result and the tool calls, so
+				// there is no agent handle in scope to query.
+			}
+
+			// Return the model's final output for scoring
+			return promptOutput
+		},
+		// Scoring functions to evaluate the outcome against the 'expected' description
+		scorers: [checkFactuality],
+		// Passing threshold (1 = perfect score required)
+		threshold: 1,
+		// Timeout per test case
+		timeout: 60000, // 60 seconds
+	})
+})
@@ -0,0 +1,85 @@
+import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
+import { MCPClientManager } from 'agents/mcp/client'
+import { streamText, tool } from 'ai'
+import { z } from 'zod'
+
+import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
+import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
+
+/**
+ * Creates an MCP client manager named `test-client` (version `0.0.0`) and
+ * connects it to the SSE endpoint at `http://localhost:8976/sse`.
+ *
+ * @returns The manager, after the connection attempt has been awaited.
+ */
+export async function initializeClient(): Promise<MCPClientManager> {
+	const manager = new MCPClientManager('test-client', '0.0.0')
+	await manager.connect('http://localhost:8976/sse')
+	return manager
+}
+
+/**
+ * Runs one prompt against `model`, offering it every tool listed by the MCP
+ * client manager, and renders the resulting conversation for scoring.
+ *
+ * @param clientManager - MCP client manager whose listed tools are exposed to the model.
+ * @param model - Language model under evaluation.
+ * @param input - Prompt sent to the model.
+ * @returns `promptOutput`: the response messages rendered as `<message_content>`
+ * elements (text and tool calls); `fullResult`: the `streamText` result;
+ * `toolCalls`: every tool-call part found in the response messages, in order.
+ */
+export async function runTask(
+	clientManager: MCPClientManager,
+	model: LanguageModelV1,
+	input: string
+): Promise<{
+	promptOutput: string
+	fullResult: StreamTextResult<ToolSet, never>
+	toolCalls: ToolCallPart[]
+}> {
+	// Wrap each MCP tool as an AI SDK tool that forwards the call to the client manager
+	const toolSet: ToolSet = {}
+	for (const mcpTool of clientManager.listTools()) {
+		toolSet[mcpTool.name] = tool({
+			parameters: jsonSchemaToZod(mcpTool.inputSchema as JsonSchemaObject),
+			description: mcpTool.description,
+			execute: async (args, opts) => {
+				try {
+					const res = await clientManager.callTool(
+						{
+							...mcpTool,
+							arguments: { ...args },
+						},
+						z.any() as any,
+						{ signal: opts.abortSignal }
+					)
+					return res.content
+				} catch (e: unknown) {
+					console.log('Error calling tool')
+					console.log(e)
+					// JSON.stringify(new Error(...)) yields "{}", so return the message
+					// text to keep the failure visible in the tool result
+					return e instanceof Error ? e.message : e
+				}
+			},
+		})
+	}
+
+	const res = streamText({
+		model,
+		system:
+			"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
+		tools: toolSet,
+		prompt: input,
+		maxRetries: 1,
+		maxSteps: 10,
+	})
+
+	// Drain the stream before reading `res.response`; the individual parts are not used
+	for await (const _part of res.fullStream) {
+		// intentionally empty
+	}
+
+	// convert into an LLM readable result so our factuality checker can validate tool calls
+	let messagesWithTools = ''
+	const toolCalls: ToolCallPart[] = []
+	const messages = (await res.response).messages
+	for (const message of messages) {
+		console.log(message.content)
+		if (typeof message.content === 'string') {
+			// Plain-string content is emitted as a single text element; iterating the
+			// string with for...of would produce one element per character
+			messagesWithTools += `<message_content type="text">${message.content}</message_content>`
+			continue
+		}
+		for (const messagePart of message.content) {
+			if (messagePart.type === 'tool-call') {
+				messagesWithTools += `<message_content type=${messagePart.type}>
+    <tool_name>${messagePart.toolName}</tool_name>
+    <tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
+</message_content>`
+				toolCalls.push(messagePart)
+			} else if (messagePart.type === 'text') {
+				messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
+			}
+		}
+	}
+
+	return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
+}
@@ -8,6 +8,9 @@
 		"deploy": "wrangler deploy",
 		"deploy:staging": "wrangler deploy --env staging",
 		"deploy:production": "wrangler deploy --env production",
+		"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8976 'vitest --testTimeout=60000 --config vitest.config.evals.ts'",
+		"eval:server": "concurrently \"tsx container/index.ts\" \"wrangler dev --var \"ENVIRONMENT:test\"\"",
+		"eval:ci": "start-server-and-test --expect 404 eval:server http://localhost:8976 'vitest run --testTimeout=60000 --config vitest.config.evals.ts'",
 		"dev": "wrangler dev",
 		"start": "wrangler dev",
 		"types": "wrangler types --include-env=false",
@@ -25,10 +28,15 @@
 		"@cloudflare/workers-oauth-provider": "0.0.3",
 		"@modelcontextprotocol/sdk": "1.10.2",
 		"@n8n/json-schema-to-zod": "1.1.0",
+		"@repo/eval-tools": "workspace:*",
 		"@repo/mcp-common": "workspace:*",
 		"@repo/mcp-observability": "workspace:*",
 		"agents": "0.0.67",
+		"ai": "4.3.6",
+		"concurrently": "9.1.2",
 		"hono": "4.7.6",
+		"start-server-and-test": "2.0.11",
+		"vitest-evals": "0.1.4",
 		"zod": "3.24.2"
 	}
 }
@@ -1,4 +1,4 @@
 {
 	"extends": "@repo/typescript-config/workers.json",
-	"include": ["*/**.ts"]
+	"include": ["*/**.ts", "./vitest.config.evals.ts"]
 }
@@ -0,0 +1,18 @@
+import { defineWorkersConfig } from '@cloudflare/vitest-pool-workers/config'
+
+// Glob for eval files: any path ending in `.eval.` followed by js/ts, optionally
+// prefixed with c or m and optionally suffixed with x
+const evalFileGlob = '**/*.eval.?(c|m)[jt]s?(x)'
+
+// Options handed to the Workers pool: isolated storage enabled, the wrangler
+// config path, and an ENVIRONMENT binding set to 'test'
+const workersPool = {
+	isolatedStorage: true,
+	wrangler: { configPath: './wrangler.jsonc' },
+	miniflare: {
+		bindings: { ENVIRONMENT: 'test' },
+	},
+}
+
+// Vitest configuration used for the eval suite
+export default defineWorkersConfig({
+	test: {
+		include: [evalFileGlob],
+		poolOptions: { workers: workersPool },
+	},
+})
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`	`1`	`{`
`2`	`2`	`"extends": "@repo/typescript-config/workers.json",`
`3`		`- "include": ["*/**.ts"]`
	`3`	`+ "include": ["*/**.ts", "./vitest.config.evals.ts"]`
`4`	`4`	`}`