Skip to content

Commit 84462ab

Browse files
committed
feat: working gemini evals
1 parent 830069b commit 84462ab

File tree

4 files changed: +31 -9 lines changed

4 files changed: +31 -9 lines changed

.vscode/launch.json

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,32 @@
1010
"attachExistingChildren": false,
1111
"autoAttachChildProcesses": false,
1212
"sourceMaps": true // works with or without this line
13+
},
14+
{
15+
"type": "node",
16+
"request": "launch",
17+
"name": "Open inspector with Vitest",
18+
"runtimeExecutable": "npm",
19+
"runtimeArgs": ["run", "eval:dev"],
20+
"console": "integratedTerminal",
21+
"cwd": "${workspaceFolder}/apps/workers-bindings"
22+
},
23+
{
24+
"name": "Attach to Workers Runtime",
25+
"type": "node",
26+
"request": "attach",
27+
"port": 9229,
28+
"cwd": "/",
29+
"resolveSourceMapLocations": null,
30+
"attachExistingChildren": false,
31+
"autoAttachChildProcesses": false
32+
}
33+
],
34+
"compounds": [
35+
{
36+
"name": "Debug Workers tests",
37+
"configurations": ["Open inspector with Vitest", "Attach to Workers Runtime"],
38+
"stopAll": true
1339
}
1440
]
1541
}

apps/workers-bindings/evals/kv_namespaces.eval.ts

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,6 @@ eachModel('$modelName', ({ model }) => {
1919
task: async (input: string) => {
2020
const client = await initializeClient(/* Pass necessary mocks/config */)
2121
const { promptOutput, toolCalls } = await runTask(client, model, input)
22-
console.log('Creating kv namespace', JSON.stringify(toolCalls, null, 2))
2322
const toolCall = toolCalls.find(
2423
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_create
2524
)
@@ -41,7 +40,6 @@ eachModel('$modelName', ({ model }) => {
4140
task: async (input: string) => {
4241
const client = await initializeClient(/* Pass necessary mocks/config */)
4342
const { promptOutput, toolCalls } = await runTask(client, model, input)
44-
console.log('Listing kv namespaces', JSON.stringify(toolCalls, null, 2))
4543
const toolCall = toolCalls.find(
4644
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespaces_list
4745
)
@@ -57,14 +55,13 @@ eachModel('$modelName', ({ model }) => {
5755
data: async () => [
5856
{
5957
input:
60-
'Rename my Cloudflare KV Namespace called "my-test-namespace" to "my-new-test-namespace". Assume the namespace exists. No need to look it up.',
58+
'Rename my Cloudflare KV Namespace with ID 1234 to "my-new-test-namespace".',
6159
expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_update} tool should be called to rename the kv namespace.`,
6260
},
6361
],
6462
task: async (input: string) => {
6563
const client = await initializeClient(/* Pass necessary mocks/config */)
6664
const { promptOutput, toolCalls } = await runTask(client, model, input)
67-
console.log('Renaming kv namespace', JSON.stringify(toolCalls, null, 2))
6865
const toolCall = toolCalls.find(
6966
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_update
7067
)
@@ -79,14 +76,13 @@ eachModel('$modelName', ({ model }) => {
7976
describeEval('Get Cloudflare KV Namespace Details', {
8077
data: async () => [
8178
{
82-
input: 'Get details of my Cloudflare KV Namespace called "my-new-test-namespace".',
79+
input: 'Get details of my Cloudflare KV Namespace with ID 1234.',
8380
expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_get} tool should be called to retrieve the details of the kv namespace.`,
8481
},
8582
],
8683
task: async (input: string) => {
8784
const client = await initializeClient(/* Pass necessary mocks/config */)
8885
const { promptOutput, toolCalls } = await runTask(client, model, input)
89-
console.log('Getting kv namespace details', JSON.stringify(toolCalls, null, 2))
9086
const toolCall = toolCalls.find(
9187
(call) => call.toolName === KV_NAMESPACE_TOOLS.kv_namespace_get
9288
)
@@ -101,7 +97,7 @@ eachModel('$modelName', ({ model }) => {
10197
describeEval('Delete Cloudflare KV Namespace', {
10298
data: async () => [
10399
{
104-
input: 'Delete the "my-new-test-namespace" kv namespace.',
100+
input: 'Delete the kv namespace with ID 1234.',
105101
expected: `The ${KV_NAMESPACE_TOOLS.kv_namespace_delete} tool should be called to delete the kv namespace.`,
106102
},
107103
],

apps/workers-bindings/package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
"deploy": "wrangler deploy",
99
"deploy:staging": "wrangler deploy --env staging",
1010
"deploy:production": "wrangler deploy --env production",
11-
"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest kv_namespaces.eval.ts --testTimeout=60000 --config vitest.config.evals.ts'",
11+
"eval:dev": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest --testTimeout=60000 --config vitest.config.evals.ts --inspect=9229 --no-file-parallelism'",
1212
"eval:server": "wrangler dev --var ENVIRONMENT:test --var DEV_DISABLE_OAUTH:true --var DEV_CLOUDFLARE_EMAIL:[email protected] --inspector-port 9230 --port 8977",
1313
"eval:ci": "start-server-and-test --expect 404 eval:server http://localhost:8977 'vitest run --testTimeout=60000 --config vitest.config.evals.ts'",
1414
"dev": "wrangler dev",

packages/eval-tools/src/test-models.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,7 @@ export const eachModel = describe.each([
8585
getOpenAiModel('gpt-4o'),
8686
getOpenAiModel('gpt-4o-mini'),
8787
getAnthropicModel('claude-3-5-sonnet-20241022'),
88-
getGeminiModel('gemini-2.5-pro-exp-03-25')
88+
getGeminiModel('gemini-2.0-flash')
8989
// llama 3 is somewhat inconsistent
9090
//getWorkersAiModel("@cf/meta/llama-3.3-70b-instruct-fp8-fast")
9191
// Currently llama 4 is having issues with tool calling

0 commit comments

Comments
 (0)