Skip to content

Commit 38aa001

Browse files
authored
Add evals for the bindings server and a hyperdrive binding (cloudflare#117)
* feat: add hyperdrive bindings and evals
* chore: fix dev mode and change CI to use dev mode
* chore: package version updates
* chore: fix formatting
* chore: do not fail with no tests
* fix: make evals work
* fix: formatting
* fix: change port
* fix: override inspector port
* chore: remove console.logs
* chore: fix formatting
* chore: PR feedback

---------

Co-authored-by: jdelorey@cloudflare.com <jdelorey@cloudflare.com>
1 parent 83574f9 commit 38aa001

File tree

22 files changed

+812
-69
lines changed

22 files changed

+812
-69
lines changed

.github/workflows/evals.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,12 @@ jobs:
2222
- name: Create .dev.vars file
2323
run: |
2424
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/sandbox-container/.dev.vars
25+
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/workers-bindings/.dev.vars
26+
echo "DEV_CLOUDFLARE_API_TOKEN=${{ secrets.DEV_CLOUDFLARE_API_TOKEN }}" >> ./apps/workers-bindings/.dev.vars
2527
- name: Verify .dev.vars file
2628
run: |
2729
du -h ./apps/sandbox-container/.dev.vars
30+
du -h ./apps/workers-bindings/.dev.vars
2831
- name: Install dependencies
2932
run: pnpm install
3033
- name: Run evals

apps/demo-day/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"@modelcontextprotocol/sdk": "1.10.2",
1515
"@repo/mcp-common": "workspace:*",
1616
"@repo/mcp-observability": "workspace:*",
17+
"@types/node": "22.14.1",
1718
"agents": "0.0.67",
1819
"zod": "3.24.2"
1920
},

apps/sandbox-container/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"@repo/eval-tools": "workspace:*",
2727
"@repo/mcp-common": "workspace:*",
2828
"@repo/mcp-observability": "workspace:*",
29+
"@types/node": "22.14.1",
2930
"agents": "0.0.67",
3031
"cron-schedule": "5.0.4",
3132
"esbuild": "0.25.1",
@@ -40,7 +41,7 @@
4041
"@cloudflare/vitest-pool-workers": "0.8.14",
4142
"@types/mock-fs": "4.13.4",
4243
"@types/node": "22.14.1",
43-
"ai": "4.3.6",
44+
"ai": "4.3.10",
4445
"concurrently": "9.1.2",
4546
"mock-fs": "5.5.0",
4647
"start-server-and-test": "2.0.11",

apps/sandbox-container/server/index.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,4 @@ export default {
7878
clientRegistrationEndpoint: '/register',
7979
}).fetch(req, env, ctx)
8080
},
81-
} /*
82-
83-
*/
81+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
// Define a mock account ID for testing
10+
const MOCK_ACCOUNT_ID = 'mock-account-12345'
11+
12+
eachModel('$modelName', ({ model }) => {
13+
describeEval('List Cloudflare Accounts', {
14+
data: async () => [
15+
{
16+
input: 'List all my Cloudflare accounts.',
17+
expected: 'The accounts_list tool should be called to retrieve the list of accounts.',
18+
},
19+
],
20+
task: async (input: string) => {
21+
const client = await initializeClient()
22+
const { promptOutput, toolCalls } = await runTask(client, model, input)
23+
24+
const toolCall = toolCalls.find((call) => call.toolName === 'accounts_list')
25+
expect(toolCall, 'Tool accounts_list was not called').toBeDefined()
26+
return promptOutput
27+
},
28+
scorers: [checkFactuality],
29+
threshold: 1,
30+
timeout: 60000, // 60 seconds
31+
})
32+
describeEval('Set Active Cloudflare Account', {
33+
data: async () => [
34+
{
35+
input: `Set my active Cloudflare account to ${MOCK_ACCOUNT_ID}.`,
36+
expected: `The set_active_account tool should be called with the account ID ${MOCK_ACCOUNT_ID}.`,
37+
},
38+
],
39+
task: async (input: string) => {
40+
const client = await initializeClient()
41+
const { promptOutput, toolCalls } = await runTask(client, model, input)
42+
const toolCall = toolCalls.find((call) => call.toolName === 'set_active_account')
43+
expect(toolCall, 'Tool set_active_account was not called').toBeDefined()
44+
45+
expect(toolCall?.args, 'Arguments for set_active_account did not match').toEqual(
46+
expect.objectContaining({ activeAccountIdParam: MOCK_ACCOUNT_ID })
47+
)
48+
return promptOutput
49+
},
50+
scorers: [checkFactuality],
51+
threshold: 1,
52+
timeout: 60000, // 60 seconds
53+
})
54+
})
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
const HYPERDRIVE_NAME = 'neon-test-hyperdrive'
10+
const HYPERDRIVE_DATABASE = 'neondb'
11+
const HYPERDRIVE_HOST = 'ep-late-cell-a4fm3g5p-pooler.us-east-1.aws.neon.tech'
12+
const HYPERDRIVE_PORT = 5432
13+
const HYPERDRIVE_USER = 'neondb_owner'
14+
const HYPERDRIVE_PASSWORD = 'my-test-password'
15+
16+
eachModel('$modelName', ({ model }) => {
17+
describeEval('Hyperdrive Tool Evaluations', {
18+
data: async () => [
19+
{
20+
input: `Create a new Hyperdrive configuration with the name "${HYPERDRIVE_NAME}" and the database "${HYPERDRIVE_DATABASE}" and the host "${HYPERDRIVE_HOST}" and the port "${HYPERDRIVE_PORT}" and the user "${HYPERDRIVE_USER}" and the password "${HYPERDRIVE_PASSWORD}".`,
21+
expected:
22+
'The hyperdrive_configs_create tool should be called to create a new hyperdrive configuration.',
23+
},
24+
],
25+
task: async (input: string) => {
26+
const client = await initializeClient(/* Pass necessary mocks/config */)
27+
const { promptOutput, toolCalls } = await runTask(client, model, input)
28+
29+
const toolCall = toolCalls.find((call) => call.toolName === 'hyperdrive_config_create')
30+
expect(toolCall, 'Tool hyperdrive_configs_create was not called').toBeDefined()
31+
32+
return promptOutput
33+
},
34+
scorers: [checkFactuality],
35+
threshold: 1,
36+
timeout: 60000, // 60 seconds
37+
})
38+
})
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
eachModel('$modelName', ({ model }) => {
10+
describeEval('Create Cloudflare KV Namespace', {
11+
data: async () => [
12+
{
13+
input: 'Create a new Cloudflare KV Namespace called "my-test-namespace".',
14+
expected: 'The kv_namespaces_create tool should be called to create a new kv namespace.',
15+
},
16+
],
17+
task: async (input: string) => {
18+
const client = await initializeClient(/* Pass necessary mocks/config */)
19+
const { promptOutput, toolCalls } = await runTask(client, model, input)
20+
21+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_create')
22+
expect(toolCall, 'Tool kv_namespace_create was not called').toBeDefined()
23+
24+
return promptOutput
25+
},
26+
scorers: [checkFactuality],
27+
threshold: 1,
28+
timeout: 60000, // 60 seconds
29+
})
30+
describeEval('List Cloudflare KV Namespaces', {
31+
data: async () => [
32+
{
33+
input: 'List all my Cloudflare KV Namespaces.',
34+
expected:
35+
'The kv_namespaces_list tool should be called to retrieve the list of kv namespaces. There should be at least one kv namespace in the list.',
36+
},
37+
],
38+
task: async (input: string) => {
39+
const client = await initializeClient(/* Pass necessary mocks/config */)
40+
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
41+
42+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespaces_list')
43+
expect(toolCall, 'Tool kv_namespaces_list was not called').toBeDefined()
44+
45+
return promptOutput
46+
},
47+
scorers: [checkFactuality],
48+
threshold: 1,
49+
timeout: 60000, // 60 seconds
50+
})
51+
describeEval('Rename Cloudflare KV Namespace', {
52+
data: async () => [
53+
{
54+
input:
55+
'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace".',
56+
expected: 'The kv_namespace_update tool should be called to rename the kv namespace.',
57+
},
58+
],
59+
task: async (input: string) => {
60+
const client = await initializeClient(/* Pass necessary mocks/config */)
61+
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
62+
63+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_update')
64+
expect(toolCall, 'Tool kv_namespace_update was not called').toBeDefined()
65+
66+
return promptOutput
67+
},
68+
scorers: [checkFactuality],
69+
threshold: 1,
70+
timeout: 60000, // 60 seconds
71+
})
72+
describeEval('Get Cloudflare KV Namespace Details', {
73+
data: async () => [
74+
{
75+
input: 'Get details of my Cloudflare KV Namespace called "my-new-test-namespace".',
76+
expected:
77+
'The kv_namespace_get tool should be called to retrieve the details of the kv namespace.',
78+
},
79+
],
80+
task: async (input: string) => {
81+
const client = await initializeClient(/* Pass necessary mocks/config */)
82+
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
83+
84+
console.log('fullResult', JSON.stringify(await fullResult.response, null, 2))
85+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_get')
86+
expect(toolCall, 'Tool kv_namespace_get was not called').toBeDefined()
87+
88+
return promptOutput
89+
},
90+
scorers: [checkFactuality],
91+
threshold: 1,
92+
timeout: 60000, // 60 seconds
93+
})
94+
describeEval('Delete Cloudflare KV Namespace', {
95+
data: async () => [
96+
{
97+
input: 'Look up the id of my only KV namespace and delete it.',
98+
expected: 'The kv_namespace_delete tool should be called to delete the kv namespace.',
99+
},
100+
],
101+
task: async (input: string) => {
102+
const client = await initializeClient(/* Pass necessary mocks/config */)
103+
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
104+
105+
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_delete')
106+
expect(toolCall, 'Tool kv_namespace_delete was not called').toBeDefined()
107+
108+
return promptOutput
109+
},
110+
scorers: [checkFactuality],
111+
threshold: 1,
112+
timeout: 60000, // 60 seconds
113+
})
114+
})
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import { MCPClientManager } from 'agents/mcp/client'
2+
import { jsonSchema, streamText, tool } from 'ai'
3+
import { z } from 'zod'
4+
5+
import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
6+
7+
export async function initializeClient(): Promise<MCPClientManager> {
8+
const clientManager = new MCPClientManager('test-client', '0.0.0')
9+
await clientManager.connect('http://localhost:8977/sse')
10+
return clientManager
11+
}
12+
13+
export async function runTask(
14+
clientManager: MCPClientManager,
15+
model: LanguageModelV1,
16+
input: string
17+
): Promise<{
18+
promptOutput: string
19+
fullResult: StreamTextResult<ToolSet, never>
20+
toolCalls: ToolCallPart[]
21+
}> {
22+
const tools = clientManager.listTools()
23+
const toolSet: ToolSet = tools.reduce((acc, v) => {
24+
if (!v.inputSchema.properties) {
25+
v.inputSchema.properties = {}
26+
}
27+
28+
acc[v.name] = tool({
29+
parameters: jsonSchema(v.inputSchema as any),
30+
description: v.description,
31+
execute: async (args: any, opts) => {
32+
try {
33+
const res = await clientManager.callTool(
34+
{
35+
...v,
36+
arguments: { ...args },
37+
},
38+
z.any() as any,
39+
{ signal: opts.abortSignal }
40+
)
41+
return res.content
42+
} catch (e) {
43+
console.log('Error calling tool')
44+
console.log(e)
45+
return e
46+
}
47+
},
48+
})
49+
return acc
50+
}, {} as ToolSet)
51+
52+
const res = streamText({
53+
model,
54+
system:
55+
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
56+
tools: toolSet,
57+
prompt: input,
58+
maxRetries: 1,
59+
maxSteps: 10,
60+
})
61+
62+
for await (const part of res.fullStream) {
63+
}
64+
65+
// convert into an LLM readable result so our factuality checker can validate tool calls
66+
let messagesWithTools = ''
67+
const toolCalls: ToolCallPart[] = []
68+
const response = await res.response
69+
const messages = response.messages
70+
71+
for (const message of messages) {
72+
for (const messagePart of message.content) {
73+
if (typeof messagePart === 'string') {
74+
messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
75+
} else if (messagePart.type === 'tool-call') {
76+
messagesWithTools += `<message_content type=${messagePart.type}>
77+
<tool_name>${messagePart.toolName}</tool_name>
78+
<tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
79+
</message_content>`
80+
toolCalls.push(messagePart)
81+
} else if (messagePart.type === 'text') {
82+
messagesWithTools += `<message_content type=${messagePart.type}>${messagePart.text}</message_content>`
83+
}
84+
}
85+
}
86+
87+
return { promptOutput: messagesWithTools, fullResult: res, toolCalls }
88+
}

apps/workers-bindings/package.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
"deploy": "wrangler deploy",
99
"deploy:staging": "wrangler deploy --env staging",
1010
"deploy:production": "wrangler deploy --env production",
11+
"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest --testTimeout=60000 --config vitest.config.evals.ts'",
12+
"eval:server": "wrangler dev --var ENVIRONMENT:test --var DEV_DISABLE_OAUTH:true --var DEV_CLOUDFLARE_EMAIL:mcp-server-eval-account@workers-for-platforms-dev.cfdata.org --inspector-port 9230",
13+
"eval:ci": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest run --testTimeout=60000 --config vitest.config.evals.ts'",
1114
"dev": "wrangler dev",
1215
"start": "wrangler dev",
1316
"types": "wrangler types --include-env=false",
@@ -25,10 +28,15 @@
2528
"@cloudflare/workers-oauth-provider": "0.0.5",
2629
"@modelcontextprotocol/sdk": "1.10.2",
2730
"@n8n/json-schema-to-zod": "1.1.0",
31+
"@repo/eval-tools": "workspace:*",
2832
"@repo/mcp-common": "workspace:*",
2933
"@repo/mcp-observability": "workspace:*",
3034
"agents": "0.0.67",
35+
"ai": "4.3.10",
36+
"concurrently": "9.1.2",
3137
"hono": "4.7.6",
38+
"start-server-and-test": "2.0.11",
39+
"vitest-evals": "0.1.4",
3240
"zod": "3.24.2"
3341
}
3442
}

0 commit comments

Comments
 (0)