Skip to content

Commit 43b3886

Browse files
committed
feat: vectorize tools
1 parent b7102c4 commit 43b3886

File tree

6 files changed

+382
-54
lines changed

6 files changed

+382
-54
lines changed

apps/workers-bindings/evals/kv_namespaces.eval.ts

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ eachModel('$modelName', ({ model }) => {
2525
},
2626
scorers: [checkFactuality],
2727
threshold: 1,
28-
timeout: 60000, // 60 seconds
28+
timeout: 60000,
2929
})
3030
describeEval('List Cloudflare KV Namespaces', {
3131
data: async () => [
@@ -37,7 +37,7 @@ eachModel('$modelName', ({ model }) => {
3737
],
3838
task: async (input: string) => {
3939
const client = await initializeClient(/* Pass necessary mocks/config */)
40-
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
40+
const { promptOutput, toolCalls } = await runTask(client, model, input)
4141

4242
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespaces_list')
4343
expect(toolCall, 'Tool kv_namespaces_list was not called').toBeDefined()
@@ -46,7 +46,7 @@ eachModel('$modelName', ({ model }) => {
4646
},
4747
scorers: [checkFactuality],
4848
threshold: 1,
49-
timeout: 60000, // 60 seconds
49+
timeout: 60000,
5050
})
5151
describeEval('Rename Cloudflare KV Namespace', {
5252
data: async () => [
@@ -58,7 +58,7 @@ eachModel('$modelName', ({ model }) => {
5858
],
5959
task: async (input: string) => {
6060
const client = await initializeClient(/* Pass necessary mocks/config */)
61-
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
61+
const { promptOutput, toolCalls } = await runTask(client, model, input)
6262

6363
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_update')
6464
expect(toolCall, 'Tool kv_namespace_update was not called').toBeDefined()
@@ -67,7 +67,7 @@ eachModel('$modelName', ({ model }) => {
6767
},
6868
scorers: [checkFactuality],
6969
threshold: 1,
70-
timeout: 60000, // 60 seconds
70+
timeout: 60000,
7171
})
7272
describeEval('Get Cloudflare KV Namespace Details', {
7373
data: async () => [
@@ -79,17 +79,16 @@ eachModel('$modelName', ({ model }) => {
7979
],
8080
task: async (input: string) => {
8181
const client = await initializeClient(/* Pass necessary mocks/config */)
82-
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
82+
const { promptOutput, toolCalls } = await runTask(client, model, input)
8383

84-
console.log('fullResult', JSON.stringify(await fullResult.response, null, 2))
8584
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_get')
8685
expect(toolCall, 'Tool kv_namespace_get was not called').toBeDefined()
8786

8887
return promptOutput
8988
},
9089
scorers: [checkFactuality],
9190
threshold: 1,
92-
timeout: 60000, // 60 seconds
91+
timeout: 60000,
9392
})
9493
describeEval('Delete Cloudflare KV Namespace', {
9594
data: async () => [
@@ -100,7 +99,7 @@ eachModel('$modelName', ({ model }) => {
10099
],
101100
task: async (input: string) => {
102101
const client = await initializeClient(/* Pass necessary mocks/config */)
103-
const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
102+
const { promptOutput, toolCalls } = await runTask(client, model, input)
104103

105104
const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_delete')
106105
expect(toolCall, 'Tool kv_namespace_delete was not called').toBeDefined()
@@ -109,6 +108,6 @@ eachModel('$modelName', ({ model }) => {
109108
},
110109
scorers: [checkFactuality],
111110
threshold: 1,
112-
timeout: 60000, // 60 seconds
111+
timeout: 60000,
113112
})
114113
})
Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,315 @@
1+
import { expect } from 'vitest'
2+
import { describeEval } from 'vitest-evals'
3+
4+
import { checkFactuality } from '@repo/eval-tools/src/scorers'
5+
import { eachModel } from '@repo/eval-tools/src/test-models'
6+
7+
import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
8+
9+
const MOCK_INDEX_NAME = 'test-vectorize-index'
10+
const MOCK_INDEX_DESCRIPTION = 'A test index for evaluation'
11+
const MOCK_DIMENSIONS = 50
12+
const MOCK_METRIC = 'cosine'
13+
const MOCK_PRESET = '@cf/baai/bge-small-en-v1.5'
14+
const MOCK_VECTOR_ID_1 = 'vec1'
15+
const MOCK_VECTOR_ID_2 = 'vec2'
16+
const MOCK_NDJSON_INSERT = `{"id":"${MOCK_VECTOR_ID_1}","values":[0.1,0.2,0.3],"metadata":{"text":"vector 1"}}\n{"id":"${MOCK_VECTOR_ID_2}","values":[0.4,0.5,0.6],"namespace":"ns1"}`
17+
const MOCK_NDJSON_UPSERT = `{"id":"${MOCK_VECTOR_ID_1}","values":[0.11,0.22,0.33],"metadata":{"text":"updated vector 1"}}`
18+
const MOCK_QUERY_VECTOR = [0.1, 0.2, 0.3]
19+
20+
eachModel('$modelName', ({ model }) => {
21+
describeEval('Create Vectorize Index (Dimensions/Metric)', {
22+
data: async () => [
23+
{
24+
input: `Create a Vectorize index named "${MOCK_INDEX_NAME}" with ${MOCK_DIMENSIONS} dimensions using the "${MOCK_METRIC}" metric. Add description: "${MOCK_INDEX_DESCRIPTION}".`,
25+
expected: `The vectorize_index_create tool should be called with name "${MOCK_INDEX_NAME}", config specifying ${MOCK_DIMENSIONS} dimensions and "${MOCK_METRIC}" metric, and description "${MOCK_INDEX_DESCRIPTION}".`,
26+
},
27+
],
28+
task: async (input: string) => {
29+
const client = await initializeClient()
30+
const { promptOutput, toolCalls } = await runTask(client, model, input)
31+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_create')
32+
expect(toolCall, 'Tool vectorize_index_create was not called').toBeDefined()
33+
expect(toolCall?.args, 'Arguments did not match').toEqual(
34+
expect.objectContaining({
35+
name: MOCK_INDEX_NAME,
36+
config: expect.objectContaining({
37+
dimensions: MOCK_DIMENSIONS,
38+
metric: MOCK_METRIC,
39+
}),
40+
description: MOCK_INDEX_DESCRIPTION,
41+
})
42+
)
43+
return promptOutput
44+
},
45+
scorers: [checkFactuality],
46+
threshold: 1,
47+
timeout: 60000,
48+
})
49+
50+
// --- Test vectorize_index_create (with preset) ---
51+
describeEval('Create Vectorize Index (Preset)', {
52+
data: async () => [
53+
{
54+
input: `Create a Vectorize index named "${MOCK_INDEX_NAME}-preset" using the "${MOCK_PRESET}" preset.`,
55+
expected: `The vectorize_index_create tool should be called with name "${MOCK_INDEX_NAME}-preset" and config specifying the preset "${MOCK_PRESET}".`,
56+
},
57+
],
58+
task: async (input: string) => {
59+
const client = await initializeClient()
60+
const { promptOutput, toolCalls } = await runTask(client, model, input)
61+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_create')
62+
expect(toolCall, 'Tool vectorize_index_create was not called').toBeDefined()
63+
expect(toolCall?.args, 'Arguments did not match').toEqual(
64+
expect.objectContaining({
65+
name: `${MOCK_INDEX_NAME}-preset`,
66+
config: expect.objectContaining({
67+
preset: MOCK_PRESET,
68+
}),
69+
})
70+
)
71+
return promptOutput
72+
},
73+
scorers: [checkFactuality],
74+
threshold: 1,
75+
timeout: 60000,
76+
})
77+
78+
// --- Test vectorize_index_list ---
79+
describeEval('List Vectorize Indexes', {
80+
data: async () => [
81+
{
82+
input: 'List my Vectorize indexes.',
83+
expected: 'The vectorize_index_list tool should be called.',
84+
},
85+
{
86+
input: 'Show me page 2 of my Vectorize indexes, 10 per page, ordered by name descending.',
87+
expected:
88+
'The vectorize_index_list tool should be called with page 2, per_page 10, order name, direction desc.',
89+
},
90+
],
91+
task: async (input: string) => {
92+
const client = await initializeClient()
93+
const { promptOutput, toolCalls } = await runTask(client, model, input)
94+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_list')
95+
expect(toolCall, 'Tool vectorize_index_list was not called').toBeDefined()
96+
97+
// Check specific args only for the pagination case
98+
if (input.includes('page 2')) {
99+
expect(toolCall?.args, 'Pagination arguments did not match').toEqual(
100+
expect.objectContaining({
101+
page: 2,
102+
per_page: 10,
103+
order: 'name',
104+
direction: 'desc',
105+
})
106+
)
107+
}
108+
109+
return promptOutput
110+
},
111+
scorers: [checkFactuality],
112+
threshold: 1,
113+
timeout: 60000,
114+
})
115+
116+
// --- Test vectorize_index_get ---
117+
describeEval('Get Vectorize Index Details', {
118+
data: async () => [
119+
{
120+
input: `Get the details for the Vectorize index named "${MOCK_INDEX_NAME}".`,
121+
expected: `The vectorize_index_get tool should be called with name "${MOCK_INDEX_NAME}".`,
122+
},
123+
],
124+
task: async (input: string) => {
125+
const client = await initializeClient()
126+
const { promptOutput, toolCalls } = await runTask(client, model, input)
127+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_get')
128+
expect(toolCall, 'Tool vectorize_index_get was not called').toBeDefined()
129+
expect(toolCall?.args, 'Arguments did not match').toEqual(
130+
expect.objectContaining({
131+
name: MOCK_INDEX_NAME,
132+
})
133+
)
134+
return promptOutput
135+
},
136+
scorers: [checkFactuality],
137+
threshold: 1,
138+
timeout: 60000,
139+
})
140+
141+
describeEval('Get Vectorize Index Info', {
142+
data: async () => [
143+
{
144+
input: `Get operational info for the Vectorize index "${MOCK_INDEX_NAME}".`,
145+
expected: `The vectorize_index_info tool should be called with name "${MOCK_INDEX_NAME}".`,
146+
},
147+
],
148+
task: async (input: string) => {
149+
const client = await initializeClient()
150+
const { promptOutput, toolCalls } = await runTask(client, model, input)
151+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_info')
152+
expect(toolCall, 'Tool vectorize_index_info was not called').toBeDefined()
153+
expect(toolCall?.args, 'Arguments did not match').toEqual(
154+
expect.objectContaining({
155+
name: MOCK_INDEX_NAME,
156+
})
157+
)
158+
return promptOutput
159+
},
160+
scorers: [checkFactuality],
161+
threshold: 1,
162+
timeout: 60000,
163+
})
164+
165+
describeEval('Insert Vectors into Index', {
166+
data: async () => [
167+
{
168+
input: `Insert the following vectors into the "${MOCK_INDEX_NAME}" index:\n${MOCK_NDJSON_INSERT}`,
169+
expected: `The vectorize_index_insert tool should be called for index "${MOCK_INDEX_NAME}" with the provided NDJSON data.`,
170+
},
171+
],
172+
task: async (input: string) => {
173+
const client = await initializeClient()
174+
const { promptOutput, toolCalls } = await runTask(client, model, input)
175+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_insert')
176+
expect(toolCall, 'Tool vectorize_index_insert was not called').toBeDefined()
177+
expect(toolCall?.args, 'Arguments did not match').toEqual(
178+
expect.objectContaining({
179+
name: MOCK_INDEX_NAME,
180+
vectors_ndjson: expect.stringContaining(MOCK_VECTOR_ID_1), // Check if body contains expected data
181+
})
182+
)
183+
return promptOutput
184+
},
185+
scorers: [checkFactuality],
186+
threshold: 1,
187+
timeout: 60000,
188+
})
189+
190+
describeEval('Upsert Vectors into Index', {
191+
data: async () => [
192+
{
193+
input: `Upsert these vectors in the "${MOCK_INDEX_NAME}" index:\n${MOCK_NDJSON_UPSERT}`,
194+
expected: `The vectorize_index_upsert tool should be called for index "${MOCK_INDEX_NAME}" with the provided NDJSON data for upserting.`,
195+
},
196+
],
197+
task: async (input: string) => {
198+
const client = await initializeClient()
199+
const { promptOutput, toolCalls } = await runTask(client, model, input)
200+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_upsert')
201+
expect(toolCall, 'Tool vectorize_index_upsert was not called').toBeDefined()
202+
expect(toolCall?.args, 'Arguments did not match').toEqual(
203+
expect.objectContaining({
204+
name: MOCK_INDEX_NAME,
205+
vectors_ndjson: expect.stringContaining('updated vector 1'), // Check if body contains expected data
206+
})
207+
)
208+
return promptOutput
209+
},
210+
scorers: [checkFactuality],
211+
threshold: 1,
212+
timeout: 60000,
213+
})
214+
215+
// --- Test vectorize_index_query ---
// Prompts the model for a top-5 nearest-neighbour lookup and verifies that it
// calls vectorize_index_query with the index name, the query vector, top_k and
// return_values. The returned prompt output is then scored by checkFactuality
// against `expected`.
describeEval('Query Vectors in Index', {
  data: async () => [
    {
      // The query vector is rendered as a bracketed, comma-separated list
      // (e.g. "[0.1, 0.2, 0.3]"). A stray "}" previously followed the
      // interpolation and produced the malformed "[0.1, 0.2, 0.3}]".
      input: `Find the top 5 vectors in index "${MOCK_INDEX_NAME}" closest to this vector: [${MOCK_QUERY_VECTOR.join(', ')}]. Also return their values.`,
      expected: `The vectorize_index_query tool should be called for index "${MOCK_INDEX_NAME}" with the provided query vector, topK=5 and returnValues=true.`,
    },
  ],
  task: async (input: string) => {
    const client = await initializeClient()
    const { promptOutput, toolCalls } = await runTask(client, model, input)

    // Only the first matching call is inspected.
    const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_query')
    expect(toolCall, 'Tool vectorize_index_query was not called').toBeDefined()

    // objectContaining tolerates extra arguments the model may add, while the
    // listed keys must match exactly (the vector is compared element-wise).
    expect(toolCall?.args, 'Arguments did not match').toEqual(
      expect.objectContaining({
        name: MOCK_INDEX_NAME,
        vector: MOCK_QUERY_VECTOR,
        top_k: 5,
        return_values: true,
      })
    )
    return promptOutput
  },
  scorers: [checkFactuality],
  threshold: 1,
  timeout: 60000,
})
241+
242+
describeEval('Get Vectors by IDs', {
243+
data: async () => [
244+
{
245+
input: `Get vectors with IDs "${MOCK_VECTOR_ID_1}" and "${MOCK_VECTOR_ID_2}" from the "${MOCK_INDEX_NAME}" index.`,
246+
expected: `The vectorize_index_get_by_ids tool should be called for index "${MOCK_INDEX_NAME}" with IDs ["${MOCK_VECTOR_ID_1}", "${MOCK_VECTOR_ID_2}"].`,
247+
},
248+
],
249+
task: async (input: string) => {
250+
const client = await initializeClient()
251+
const { promptOutput, toolCalls } = await runTask(client, model, input)
252+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_get_by_ids')
253+
expect(toolCall, 'Tool vectorize_index_get_by_ids was not called').toBeDefined()
254+
expect(toolCall?.args, 'Arguments did not match').toEqual(
255+
expect.objectContaining({
256+
name: MOCK_INDEX_NAME,
257+
ids: expect.arrayContaining([MOCK_VECTOR_ID_1, MOCK_VECTOR_ID_2]),
258+
})
259+
)
260+
return promptOutput
261+
},
262+
scorers: [checkFactuality],
263+
threshold: 1,
264+
timeout: 60000,
265+
})
266+
267+
describeEval('Delete Vectors by IDs', {
268+
data: async () => [
269+
{
270+
input: `Delete vectors with IDs "${MOCK_VECTOR_ID_1}" and "${MOCK_VECTOR_ID_2}" from the index "${MOCK_INDEX_NAME}".`,
271+
expected: `The vectorize_index_delete_by_ids tool should be called for index "${MOCK_INDEX_NAME}" with IDs ["${MOCK_VECTOR_ID_1}", "${MOCK_VECTOR_ID_2}"].`,
272+
},
273+
],
274+
task: async (input: string) => {
275+
const client = await initializeClient()
276+
const { promptOutput, toolCalls } = await runTask(client, model, input)
277+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_delete_by_ids')
278+
expect(toolCall, 'Tool vectorize_index_delete_by_ids was not called').toBeDefined()
279+
expect(toolCall?.args, 'Arguments did not match').toEqual(
280+
expect.objectContaining({
281+
name: MOCK_INDEX_NAME,
282+
ids: expect.arrayContaining([MOCK_VECTOR_ID_1, MOCK_VECTOR_ID_2]),
283+
})
284+
)
285+
return promptOutput
286+
},
287+
scorers: [checkFactuality],
288+
threshold: 1,
289+
timeout: 60000,
290+
})
291+
292+
describeEval('Delete Vectorize Index', {
293+
data: async () => [
294+
{
295+
input: `Delete the Vectorize index named "${MOCK_INDEX_NAME}".`,
296+
expected: `The vectorize_index_delete tool should be called with name "${MOCK_INDEX_NAME}".`,
297+
},
298+
],
299+
task: async (input: string) => {
300+
const client = await initializeClient()
301+
const { promptOutput, toolCalls } = await runTask(client, model, input)
302+
const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_delete')
303+
expect(toolCall, 'Tool vectorize_index_delete was not called').toBeDefined()
304+
expect(toolCall?.args, 'Arguments did not match').toEqual(
305+
expect.objectContaining({
306+
name: MOCK_INDEX_NAME,
307+
})
308+
)
309+
return promptOutput
310+
},
311+
scorers: [checkFactuality],
312+
threshold: 1,
313+
timeout: 60000,
314+
})
315+
})

0 commit comments

Comments
 (0)