Skip to content
Merged
6 changes: 6 additions & 0 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ jobs:
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" > ./apps/workers-bindings/.dev.vars
echo "DEV_CLOUDFLARE_API_TOKEN=${{ secrets.DEV_CLOUDFLARE_API_TOKEN }}" >> ./apps/sandbox-container/.dev.vars
echo "DEV_CLOUDFLARE_API_TOKEN=${{ secrets.DEV_CLOUDFLARE_API_TOKEN }}" >> ./apps/workers-bindings/.dev.vars
echo "AI_GATEWAY_TOKEN=${{ secrets.AI_GATEWAY_TOKEN }}" >> ./apps/sandbox-container/.dev.vars
echo "AI_GATEWAY_TOKEN=${{ secrets.AI_GATEWAY_TOKEN }}" >> ./apps/workers-bindings/.dev.vars
echo "CLOUDFLARE_ACCOUNT_ID=${{ secrets.CLOUDFLARE_ACCOUNT_ID }}" >> ./apps/sandbox-container/.dev.vars
echo "CLOUDFLARE_ACCOUNT_ID=${{ secrets.CLOUDFLARE_ACCOUNT_ID }}" >> ./apps/workers-bindings/.dev.vars
echo "AI_GATEWAY_ID=${{ secrets.AI_GATEWAY_ID }}" >> ./apps/sandbox-container/.dev.vars
echo "AI_GATEWAY_ID=${{ secrets.AI_GATEWAY_ID }}" >> ./apps/workers-bindings/.dev.vars
- name: Verify .dev.vars file
run: |
du -h ./apps/sandbox-container/.dev.vars
Expand Down
26 changes: 26 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,32 @@
"attachExistingChildren": false,
"autoAttachChildProcesses": false,
"sourceMaps": true // works with or without this line
},
{
"type": "node",
"request": "launch",
"name": "Open inspector with Vitest",
"runtimeExecutable": "npm",
"runtimeArgs": ["run", "eval:dev"],
"console": "integratedTerminal",
"cwd": "${workspaceFolder}/apps/workers-bindings"
},
{
"name": "Attach to Workers Runtime",
"type": "node",
"request": "attach",
"port": 9229,
"cwd": "/",
"resolveSourceMapLocations": null,
"attachExistingChildren": false,
"autoAttachChildProcesses": false
}
],
"compounds": [
{
"name": "Debug Workers tests",
"configurations": ["Open inspector with Vitest", "Attach to Workers Runtime"],
"stopAll": true
}
]
}
3 changes: 3 additions & 0 deletions apps/sandbox-container/server/sandbox.server.context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ export interface Env {
MCP_SERVER_NAME: string
MCP_SERVER_VERSION: string
OPENAI_API_KEY: string
AI_GATEWAY_TOKEN: string
CLOUDFLARE_ACCOUNT_ID: string
AI_GATEWAY_ID: string
MCP_OBJECT: DurableObjectNamespace<ContainerMcpAgent>
CONTAINER_MANAGER: DurableObjectNamespace<ContainerManager>
USER_CONTAINER: DurableObjectNamespace<UserContainer>
Expand Down
3 changes: 3 additions & 0 deletions apps/sandbox-container/types.d.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
// Ambient augmentation of the `cloudflare:test` module: declares the members
// available on its `ProvidedEnv` interface.
declare module 'cloudflare:test' {
interface ProvidedEnv {
// NOTE(review): typed as the string literal 'TODO' rather than `string`, unlike
// the credential fields below — presumably mirrors a placeholder var value; confirm
// this is intended.
OPENAI_API_KEY: 'TODO'
// AI gateway credentials/identifiers, all plain strings.
AI_GATEWAY_TOKEN: string
CLOUDFLARE_ACCOUNT_ID: string
AI_GATEWAY_ID: string
// Binding of type `Ai` (type declared outside this file).
AI: Ai
}
}
12 changes: 3 additions & 9 deletions apps/workers-bindings/evals/kv_namespaces.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ eachModel('$modelName', ({ model }) => {
task: async (input: string) => {
const client = await initializeClient(/* Pass necessary mocks/config */)
const { promptOutput, toolCalls } = await runTask(client, model, input)

const toolCall = toolCalls.find(
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_create
)
Expand All @@ -41,7 +40,6 @@ eachModel('$modelName', ({ model }) => {
task: async (input: string) => {
const client = await initializeClient(/* Pass necessary mocks/config */)
const { promptOutput, toolCalls } = await runTask(client, model, input)

const toolCall = toolCalls.find(
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespaces_list
)
Expand All @@ -56,15 +54,13 @@ eachModel('$modelName', ({ model }) => {
describeEval('Rename Cloudflare KV Namespace', {
data: async () => [
{
input:
'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace".',
input: 'Rename my Cloudflare KV Namespace with ID 1234 to "my-new-test-namespace".',
expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_update} tool should be called to rename the kv namespace.`,
},
],
task: async (input: string) => {
const client = await initializeClient(/* Pass necessary mocks/config */)
const { promptOutput, toolCalls } = await runTask(client, model, input)

const toolCall = toolCalls.find(
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_update
)
Expand All @@ -79,14 +75,13 @@ eachModel('$modelName', ({ model }) => {
describeEval('Get Cloudflare KV Namespace Details', {
data: async () => [
{
input: 'Get details of my Cloudflare KV Namespace called "my-new-test-namespace".',
input: 'Get details of my Cloudflare KV Namespace with ID 1234.',
expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_get} tool should be called to retrieve the details of the kv namespace.`,
},
],
task: async (input: string) => {
const client = await initializeClient(/* Pass necessary mocks/config */)
const { promptOutput, toolCalls } = await runTask(client, model, input)

const toolCall = toolCalls.find(
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_get
)
Expand All @@ -101,14 +96,13 @@ eachModel('$modelName', ({ model }) => {
describeEval('Delete Cloudflare KV Namespace', {
data: async () => [
{
input: 'Look up the id of my only KV namespace and delete it.',
input: 'Delete the kv namespace with ID 1234.',
expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_delete} tool should be called to delete the kv namespace.`,
},
],
task: async (input: string) => {
const client = await initializeClient(/* Pass necessary mocks/config */)
const { promptOutput, toolCalls } = await runTask(client, model, input)

const toolCall = toolCalls.find(
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_delete
)
Expand Down
3 changes: 3 additions & 0 deletions apps/workers-bindings/src/bindings.context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@ export interface Env {
DEV_CLOUDFLARE_EMAIL: string
CLOUDFLARE_API_TOKEN: string
OPENAI_API_KEY: string
AI_GATEWAY_TOKEN: string
CLOUDFLARE_ACCOUNT_ID: string
AI_GATEWAY_ID: string
AI: Ai
}
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"fix:deps": "run-fix-deps",
"test:watch": "vitest",
"eval:ci": "run-turbo eval:ci",
"eval:dev": "run-turbo eval:dev",
"update-deps": "syncpack update"
},
"devDependencies": {
Expand Down
3 changes: 3 additions & 0 deletions packages/eval-tools/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@
"bin": "bin"
},
"dependencies": {
"@ai-sdk/anthropic": "1.2.11",
"@ai-sdk/google": "1.2.17",
"@ai-sdk/openai": "1.3.20",
"@cloudflare/vitest-pool-workers": "0.8.14",
"agents": "0.0.67",
"ai": "4.3.10",
"ai-gateway-provider": "0.0.6",
"workers-ai-provider": "0.3.0",
"wrangler": "4.10.0",
"zod": "3.24.2"
Expand Down
15 changes: 5 additions & 10 deletions packages/eval-tools/src/runTask.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import { type MCPClientManager } from 'agents/mcp/client'
import { jsonSchema, streamText, tool } from 'ai'
import { generateText, jsonSchema, tool } from 'ai'
import { z } from 'zod'

import type { LanguageModelV1, StreamTextResult, ToolCallPart, ToolSet } from 'ai'
import type { GenerateTextResult, LanguageModelV1, ToolCallPart, ToolSet } from 'ai'

export async function runTask(
clientManager: MCPClientManager,
model: LanguageModelV1,
input: string
): Promise<{
promptOutput: string
fullResult: StreamTextResult<ToolSet, never>
fullResult: GenerateTextResult<ToolSet, never>
toolCalls: ToolCallPart[]
}> {
const tools = clientManager.listTools()
Expand Down Expand Up @@ -43,7 +43,7 @@ export async function runTask(
return acc
}, {} as ToolSet)

const res = streamText({
const res = await generateText({
model,
system:
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
Expand All @@ -53,15 +53,10 @@ export async function runTask(
maxSteps: 10,
})

// we need to consume the full stream, so this is empty
// eslint-disable-next-line no-empty
for await (const _ of res.fullStream) {
}

// convert into an LLM readable result so our factuality checker can validate tool calls
let messagesWithTools = ''
const toolCalls: ToolCallPart[] = []
const response = await res.response
const response = res.response
const messages = response.messages

for (const message of messages) {
Expand Down
62 changes: 54 additions & 8 deletions packages/eval-tools/src/test-models.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
import { createAnthropic } from '@ai-sdk/anthropic'
import { AnthropicMessagesModelId } from '@ai-sdk/anthropic/internal'
import { createGoogleGenerativeAI } from '@ai-sdk/google'
import { GoogleGenerativeAILanguageModel } from '@ai-sdk/google/internal'
import { createOpenAI } from '@ai-sdk/openai'
import { OpenAIChatModelId } from '@ai-sdk/openai/internal'
import { createAiGateway } from 'ai-gateway-provider'
import { env } from 'cloudflare:test'
import { describe } from 'vitest'
import { createWorkersAI } from 'workers-ai-provider'
Expand All @@ -13,15 +19,56 @@ type AiTextGenerationModels = Exclude<
value2key<AiModels, BaseAiTextToImage>
>

function getOpenAiModel(modelName: string) {
if (!env.OPENAI_API_KEY) {
throw new Error('No API token set!')
function getOpenAiModel(modelName: OpenAIChatModelId) {
if (!env.CLOUDFLARE_ACCOUNT_ID || !env.AI_GATEWAY_ID || !env.AI_GATEWAY_TOKEN) {
throw new Error('No AI gateway credentials set!')
}

const aigateway = createAiGateway({
accountId: env.CLOUDFLARE_ACCOUNT_ID,
gateway: env.AI_GATEWAY_ID,
apiKey: env.AI_GATEWAY_TOKEN,
})

const ai = createOpenAI({
apiKey: env.OPENAI_API_KEY,
apiKey: '',
})

const model = ai(modelName)
const model = aigateway([ai(modelName)])

return { modelName, model, ai }
}

/**
 * Builds an Anthropic model wrapped by the AI gateway provider.
 *
 * The Anthropic provider itself is created with an empty API key; the only
 * credentials supplied here are the AI gateway ones read from `env`.
 *
 * @param modelName - Anthropic messages model id to instantiate.
 * @returns The model name, the gateway-wrapped model, and the underlying Anthropic provider.
 * @throws Error if any of the AI gateway credentials is missing from `env`.
 */
function getAnthropicModel(modelName: AnthropicMessagesModelId) {
	// Fail fast with the same error the OpenAI and Gemini factories raise, instead of
	// handing undefined/empty credentials to the gateway provider.
	if (!env.CLOUDFLARE_ACCOUNT_ID || !env.AI_GATEWAY_ID || !env.AI_GATEWAY_TOKEN) {
		throw new Error('No AI gateway credentials set!')
	}

	const aigateway = createAiGateway({
		accountId: env.CLOUDFLARE_ACCOUNT_ID,
		gateway: env.AI_GATEWAY_ID,
		apiKey: env.AI_GATEWAY_TOKEN,
	})

	// Empty provider key: only the gateway credentials above are passed from this function.
	const ai = createAnthropic({
		apiKey: '',
	})

	const model = aigateway([ai(modelName)])

	return { modelName, model, ai }
}

/**
 * Builds a Google Generative AI (Gemini) model wrapped by the AI gateway provider.
 *
 * @param modelName - Gemini model id to instantiate.
 * @returns The model name, the gateway-wrapped model, and the underlying Google provider.
 * @throws Error if any of the AI gateway credentials is missing from `env`.
 */
function getGeminiModel(modelName: GoogleGenerativeAILanguageModel['modelId']) {
	const {
		CLOUDFLARE_ACCOUNT_ID: accountId,
		AI_GATEWAY_ID: gateway,
		AI_GATEWAY_TOKEN: apiKey,
	} = env

	// All three gateway settings must be present (non-empty) before building anything.
	const hasGatewayCredentials = Boolean(accountId && gateway && apiKey)
	if (!hasGatewayCredentials) {
		throw new Error('No AI gateway credentials set!')
	}

	const aigateway = createAiGateway({ accountId, gateway, apiKey })

	// The Google provider is created with an empty key; only gateway credentials are supplied here.
	const ai = createGoogleGenerativeAI({ apiKey: '' })

	return { modelName, model: aigateway([ai(modelName)]), ai }
}
Expand All @@ -40,11 +87,10 @@ function getWorkersAiModel(modelName: AiTextGenerationModels) {
/**
 * Parameterized `describe` that runs a suite once per model entry listed below.
 * Each active entry is the `{ modelName, model, ai }` object returned by one of the
 * model factory functions in this file.
 */
export const eachModel = describe.each([
getOpenAiModel('gpt-4o'),
getOpenAiModel('gpt-4o-mini'),

// TODO: re-enable Anthropic. The evals pass with Anthropic, but our rate limit is so low
// with AI wholesaling that we can't use it in CI: it's impossible to get a complete run
// with the current limits.
// getAnthropicModel('claude-3-5-sonnet-20241022'),
getGeminiModel('gemini-2.0-flash'),
// llama 3 is somewhat inconsistent
//getWorkersAiModel("@cf/meta/llama-3.3-70b-instruct-fp8-fast")
// Currently llama 4 is having issues with tool calling
//getWorkersAiModel("@cf/meta/llama-4-scout-17b-16e-instruct")

// TODO: enable Claude and add new OpenAI models via AI gateway
])
5 changes: 4 additions & 1 deletion packages/eval-tools/wrangler.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
"name": "stub-worker",
"compatibility_date": "2025-04-14",
"vars": {
"OPENAI_API_KEY": "TODO"
"OPENAI_API_KEY": "TODO",
"AI_GATEWAY_TOKEN": "TODO",
"CLOUDFLARE_ACCOUNT_ID": "TODO",
"AI_GATEWAY_ID": "TODO"
},
"ai": {
"binding": "AI"
Expand Down
Loading