cloudflare
diff --git a/‎.github/workflows/evals.yml‎
Lines changed: 28 additions & 0 deletions b/‎.github/workflows/evals.yml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎apps/sandbox-container/README.md‎
Lines changed: 1 addition & 1 deletion b/‎apps/sandbox-container/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎apps/sandbox-container/evals/env.d.ts‎
Lines changed: 3 additions & 0 deletions b/‎apps/sandbox-container/evals/env.d.ts‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎apps/sandbox-container/evals/initialize.eval.ts‎
Lines changed: 25 additions & 0 deletions b/‎apps/sandbox-container/evals/initialize.eval.ts‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎apps/sandbox-container/evals/utils.ts‎
Lines changed: 58 additions & 0 deletions b/‎apps/sandbox-container/evals/utils.ts‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎apps/sandbox-container/package.json‎
Lines changed: 11 additions & 3 deletions b/‎apps/sandbox-container/package.json‎
Lines changed: 11 additions & 3 deletions
diff --git a/‎apps/sandbox-container/server/index.ts‎
Lines changed: 4 additions & 2 deletions b/‎apps/sandbox-container/server/index.ts‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎apps/sandbox-container/tsconfig.json‎
Lines changed: 3 additions & 14 deletions b/‎apps/sandbox-container/tsconfig.json‎
Lines changed: 3 additions & 14 deletions
diff --git a/‎apps/sandbox-container/vitest.config.evals.ts‎
Lines changed: 13 additions & 0 deletions b/‎apps/sandbox-container/vitest.config.evals.ts‎
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,28 @@
+name: Evals
+on:
+  push:
+
+jobs:
+  test:
+    runs-on: ubuntu-24.04
+    strategy:
+      matrix:
+        node-version: [22]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 10.8.0
+      - name: Use Node.js ${{ matrix.node-version }}
+        uses: actions/setup-node@v4
+        with:
+          node-version: ${{ matrix.node-version }}
+          cache: 'pnpm'
+      - name: Create .dev.vars file
+        # Pass the secret through the step environment instead of interpolating it into
+        # the script text, so special characters in the value cannot alter the command.
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          echo "OPENAI_API_KEY=$OPENAI_API_KEY" > ./apps/sandbox-container/.dev.vars
+      - name: Install dependencies
+        run: pnpm install
+      - name: Run evals
+        run: pnpm eval
@@ -20,7 +20,7 @@ Replace the content with the following configuration. Once you restart Claude De
       "command": "npx",
       "args": [
         "mcp-remote",
-        "https://mcp.cloudflare.com/workers/observability/sse"
+        "https://observability.mcp.cloudflare.com/sse"
       ]
     }
   }
 
@@ -12,7 +12,7 @@ Do the following from within the sandbox-container app:
 2. Get the Cloudflare client id and secret from a team member and add them to the `.dev.vars` file.
 3. Run `pnpm i` then `pnpm dev` to start the MCP server.
 4. Run `pnpx @modelcontextprotocol/inspector` to start the MCP inspector client.
-5. Open the inspector client in your browser and connect to the server via `http://localhost:8976/workers/sandbox/sse`.
+5. Open the inspector client in your browser and connect to the server via `http://localhost:8976/sse`.
 
 Note: Temporary files created through files tool calls are stored in the workdir folder of this app.
 
 
@@ -0,0 +1,3 @@
+// Augments the `cloudflare:test` module so that its `ProvidedEnv` interface
+// extends this project's `Env` type.
+// NOTE(review): `Env` is not imported here; presumably it comes from the generated
+// worker configuration types — confirm.
+declare module 'cloudflare:test' {
+	interface ProvidedEnv extends Env {}
+}
@@ -0,0 +1,25 @@
+import { MCPClientManager } from 'agents/mcp/client'
+import { generateText, tool, ToolExecutionOptions, ToolSet } from 'ai'
+import { describeEval } from 'vitest-evals'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { runTask } from './utils'
+
+// Registers the "Runs container initialize" eval once for every model that
+// `eachModel` supplies; the suite title is the `$modelName` template.
+eachModel('$modelName', ({ model }) => {
+	// Single eval case: the prompt plus the statement the factuality scorer checks against.
+	const loadCases = async () => [
+		{
+			input: 'create and ping a container',
+			expected:
+				'The container_initialize tool was called and then the container_ping tool was called',
+		},
+	]
+
+	describeEval('Runs container initialize', {
+		data: loadCases,
+		// Delegate straight to the shared task runner for the current model.
+		task: (input) => runTask(model, input),
+		scorers: [checkFactuality],
+		threshold: 1,
+	})
+})
@@ -0,0 +1,58 @@
+import { jsonSchemaToZod } from '@n8n/json-schema-to-zod'
+import { MCPClientManager } from 'agents/mcp/client'
+import { LanguageModelV1, streamText, tool, ToolSet } from 'ai'
+
+import type { JsonSchemaObject } from '@n8n/json-schema-to-zod'
+
+/**
+ * Runs a single eval prompt against the MCP server and returns a transcript for scoring.
+ *
+ * Connects an MCP client to `http://localhost:8787/sse`, wraps every tool the client lists
+ * as an AI SDK tool, and lets `model` work on `input` through `streamText` (at most 10 steps).
+ *
+ * @param model - Language model that drives the tool calls.
+ * @param input - User prompt for the eval case.
+ * @returns The response messages flattened into `<message_content>` elements. Only text and
+ * tool-call parts are rendered; any other part type is skipped.
+ */
+export async function runTask(model: LanguageModelV1, input: string): Promise<string> {
+	// NOTE(review): the URL is hard-coded, so the server presumably has to be running
+	// locally on port 8787 before this is called — confirm against the eval scripts.
+	const clientManager = new MCPClientManager('test-client', '0.0.0')
+	await clientManager.connect('http://localhost:8787/sse')
+
+	// Expose each MCP tool to the model under its own name; executions are forwarded
+	// to the MCP client together with the SDK's abort signal.
+	const toolSet: ToolSet = {}
+	for (const mcpTool of clientManager.listTools()) {
+		toolSet[mcpTool.name] = tool({
+			parameters: jsonSchemaToZod(mcpTool.inputSchema as JsonSchemaObject),
+			description: mcpTool.description,
+			execute: async (args, opts) => {
+				const res = await clientManager.callTool(mcpTool, args, { signal: opts.abortSignal })
+				console.log(res.toolResult)
+				return res.content
+			},
+		})
+	}
+
+	const res = streamText({
+		model,
+		system:
+			"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
+		tools: toolSet,
+		prompt: input,
+		maxRetries: 1,
+		maxSteps: 10,
+	})
+
+	for await (const _part of res.fullStream) {
+		// Intentionally empty: the stream is only iterated to completion here; the
+		// individual parts are not needed because the final messages are read below.
+	}
+
+	// convert into an LLM readable result so our factuality checker can validate tool calls
+	let messagesWithTools = ''
+	const messages = (await res.response).messages
+	for (const message of messages) {
+		console.log(message.content)
+		// Message content may be a plain string. Iterating a string directly would walk it
+		// character by character, so treat it as one text part instead.
+		const parts = typeof message.content === 'string' ? [message.content] : message.content
+		for (const messagePart of parts) {
+			if (typeof messagePart === 'string') {
+				messagesWithTools += `<message_content type="text">${messagePart}</message_content>`
+			} else if (messagePart.type === 'tool-call') {
+				messagesWithTools += `<message_content type="${messagePart.type}">
+    <tool_name>${messagePart.toolName}</tool_name>
+    <tool_arguments>${JSON.stringify(messagePart.args)}</tool_arguments>
+</message_content>`
+			} else if (messagePart.type === 'text') {
+				messagesWithTools += `<message_content type="${messagePart.type}">${messagePart.text}</message_content>`
+			}
+		}
+	}
+
+	return messagesWithTools
+}
@@ -12,30 +12,38 @@
 		"start:container": "tsx container/index.ts",
 		"postinstall": "mkdir -p workdir",
 		"test": "vitest",
-		"types": "wrangler types"
+		"types": "wrangler types",
+		"eval:dev": "concurrently \"npm run dev\" \"vitest --config vitest.config.evals.ts\"",
+		"eval": "concurrently --kill-others --success first \"npm run dev\" \"vitest run --config vitest.config.evals.ts\""
 	},
 	"dependencies": {
 		"@cloudflare/workers-oauth-provider": "0.0.2",
 		"@cloudflare/workers-types": "^4.20250320.0",
 		"@hono/node-server": "^1.13.8",
 		"@hono/zod-validator": "^0.4.3",
-		"@modelcontextprotocol/sdk": "^1.7.0",
+		"@modelcontextprotocol/sdk": "^1.9.0",
+		"@n8n/json-schema-to-zod": "^1.1.0",
+		"@repo/eval-tools": "workspace:*",
 		"@repo/mcp-common": "workspace:*",
 		"@types/node": "^22.13.10",
-		"agents": "^0.0.42",
+		"agents": "^0.0.60",
 		"cron-schedule": "^5.0.4",
 		"esbuild": "^0.25.1",
 		"hono": "^4.7.5",
 		"mime": "^4.0.6",
 		"octokit": "^4.1.2",
 		"partyserver": "^0.0.65",
+		"simple-git-hooks": "^2.12.1",
 		"tsx": "^4.19.3",
+		"vitest-evals": "^0.1.4",
 		"workers-mcp": "0.1.0-3",
 		"zod": "^3.24.2"
 	},
 	"devDependencies": {
 		"@types/mock-fs": "^4.13.4",
 		"mock-fs": "^5.5.0",
+		"@cloudflare/vitest-pool-workers": "0.8.14",
+		"ai": "^4.3.6",
 		"concurrently": "^9.1.2",
 		"wrangler": "^4.9.1"
 	}
 
@@ -17,6 +17,8 @@ export type Env = {
 	CONTAINER_MCP_AGENT: DurableObjectNamespace<ContainerMcpAgent>
 	CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
 	ENVIRONMENT: 'dev' | 'prod'
+	CLOUDFLARE_CLIENT_ID: string
+	CLOUDFLARE_CLIENT_SECRET: string
 }
 
 // Context from the auth process, encrypted & stored in the auth token
@@ -28,9 +30,9 @@ export type Props = {
 }
 
 export default new OAuthProvider({
-	apiRoute: '/workers/sandbox/sse',
+	apiRoute: '/sse',
 	// @ts-ignore
-	apiHandler: ContainerMcpAgent.mount('/workers/sandbox/sse', { binding: 'CONTAINER_MCP_AGENT' }),
+	apiHandler: ContainerMcpAgent.mount('/sse', { binding: 'CONTAINER_MCP_AGENT' }),
 	// @ts-ignore
 	defaultHandler: CloudflareAuthHandler,
 	authorizeEndpoint: '/oauth/authorize',
 
@@ -1,16 +1,5 @@
 {
-	"compilerOptions": {
-		"target": "ESNext",
-		"lib": ["ESNext", "DOM"],
-		"jsx": "react-jsx",
-		"module": "ESNext",
-		"moduleResolution": "bundler",
-		"types": ["./worker-configuration.d.ts", "@cloudflare/workers-types/2023-07-01"],
-		"noEmit": true,
-		"esModuleInterop": true,
-		"forceConsistentCasingInFileNames": true,
-		"strict": true,
-		"skipLibCheck": true
-	},
-	"include": ["server/**.ts", "shared/**.ts"]
+	"extends": "@repo/typescript-config/workers.json",
+	"include": ["*/**.ts", "./vitest.config.evals.ts"],
+	"exclude": ["container/**.ts"]
 }
@@ -0,0 +1,13 @@
+import { defineWorkersConfig } from '@cloudflare/vitest-pool-workers/config'
+
+// Vitest configuration used for the eval suite (selected via `--config vitest.config.evals.ts`).
+export default defineWorkersConfig({
+	test: {
+		// Only pick up eval files: `*.eval.` followed by js/ts, with an optional
+		// `c`/`m` prefix and an optional `x` suffix on the extension.
+		include: ['**/*.eval.?(c|m)[jt]s?(x)'],
+		poolOptions: {
+			workers: {
+				// NOTE(review): presumably gives each test its own storage state — confirm
+				// against the vitest-pool-workers documentation.
+				isolatedStorage: true,
+				// Worker settings are read from this app's wrangler configuration file.
+				wrangler: { configPath: './wrangler.jsonc' },
+			},
+		},
+	},
+})
Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,7 @@ Replace the content with the following configuration. Once you restart Claude De`
`20`	`20`	`"command": "npx",`
`21`	`21`	`"args": [`
`22`	`22`	`"mcp-remote",`
`23`		`- "https://mcp.cloudflare.com/workers/observability/sse"`
	`23`	`+ "https://observability.mcp.cloudflare.com/sse"`
`24`	`24`	`]`
`25`	`25`	`}`
`26`	`26`	`}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+declare module 'cloudflare:test' {`
	`2`	`+ interface ProvidedEnv extends Env {}`
	`3`	`+}`