cloudflare
diff --git a/apps/workers-bindings/evals/kv_namespaces.eval.ts b/apps/workers-bindings/evals/kv_namespaces.eval.ts
Lines changed: 9 additions & 10 deletions
diff --git a/apps/workers-bindings/evals/vectorize.eval.ts b/apps/workers-bindings/evals/vectorize.eval.ts
Lines changed: 315 additions & 0 deletions
@@ -25,7 +25,7 @@ eachModel('$modelName', ({ model }) => {
 		},
 		scorers: [checkFactuality],
 		threshold: 1,
-		timeout: 60000, // 60 seconds
+		timeout: 60000,
 	})
 	describeEval('List Cloudflare KV Namespaces', {
 		data: async () => [
@@ -37,7 +37,7 @@ eachModel('$modelName', ({ model }) => {
 		],
 		task: async (input: string) => {
 			const client = await initializeClient(/* Pass necessary mocks/config */)
-			const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
 
 			const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespaces_list')
 			expect(toolCall, 'Tool kv_namespaces_list was not called').toBeDefined()
@@ -46,7 +46,7 @@ eachModel('$modelName', ({ model }) => {
 		},
 		scorers: [checkFactuality],
 		threshold: 1,
-		timeout: 60000, // 60 seconds
+		timeout: 60000,
 	})
 	describeEval('Rename Cloudflare KV Namespace', {
 		data: async () => [
@@ -58,7 +58,7 @@ eachModel('$modelName', ({ model }) => {
 		],
 		task: async (input: string) => {
 			const client = await initializeClient(/* Pass necessary mocks/config */)
-			const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
 
 			const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_update')
 			expect(toolCall, 'Tool kv_namespace_update was not called').toBeDefined()
@@ -67,7 +67,7 @@ eachModel('$modelName', ({ model }) => {
 		},
 		scorers: [checkFactuality],
 		threshold: 1,
-		timeout: 60000, // 60 seconds
+		timeout: 60000,
 	})
 	describeEval('Get Cloudflare KV Namespace Details', {
 		data: async () => [
@@ -79,17 +79,16 @@ eachModel('$modelName', ({ model }) => {
 		],
 		task: async (input: string) => {
 			const client = await initializeClient(/* Pass necessary mocks/config */)
-			const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
 
-			console.log('fullResult', JSON.stringify(await fullResult.response, null, 2))
 			const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_get')
 			expect(toolCall, 'Tool kv_namespace_get was not called').toBeDefined()
 
 			return promptOutput
 		},
 		scorers: [checkFactuality],
 		threshold: 1,
-		timeout: 60000, // 60 seconds
+		timeout: 60000,
 	})
 	describeEval('Delete Cloudflare KV Namespace', {
 		data: async () => [
@@ -100,7 +99,7 @@ eachModel('$modelName', ({ model }) => {
 		],
 		task: async (input: string) => {
 			const client = await initializeClient(/* Pass necessary mocks/config */)
-			const { promptOutput, toolCalls, fullResult } = await runTask(client, model, input)
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
 
 			const toolCall = toolCalls.find((call) => call.toolName === 'kv_namespace_delete')
 			expect(toolCall, 'Tool kv_namespace_delete was not called').toBeDefined()
@@ -109,6 +108,6 @@ eachModel('$modelName', ({ model }) => {
 		},
 		scorers: [checkFactuality],
 		threshold: 1,
-		timeout: 60000, // 60 seconds
+		timeout: 60000,
 	})
 })
@@ -0,0 +1,315 @@
+import { expect } from 'vitest'
+import { describeEval } from 'vitest-evals'
+
+import { checkFactuality } from '@repo/eval-tools/src/scorers'
+import { eachModel } from '@repo/eval-tools/src/test-models'
+
+import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here
+
+const MOCK_INDEX_NAME = 'test-vectorize-index' // index name interpolated into prompts and expected tool arguments
+const MOCK_INDEX_DESCRIPTION = 'A test index for evaluation' // description sent in the dimensions/metric create eval
+const MOCK_DIMENSIONS = 50 // dimension count requested in the create eval; NOTE(review): the sample vectors in this file carry 3 values each - confirm that is intended
+const MOCK_METRIC = 'cosine' // metric requested in the dimensions/metric create eval
+const MOCK_PRESET = '@cf/baai/bge-small-en-v1.5' // preset named in the preset-based create eval
+const MOCK_VECTOR_ID_1 = 'vec1' // first vector id; appears in the insert and upsert payloads and in the by-ids evals
+const MOCK_VECTOR_ID_2 = 'vec2' // second vector id; appears in the insert payload and in the by-ids evals
+const MOCK_NDJSON_INSERT = `{"id":"${MOCK_VECTOR_ID_1}","values":[0.1,0.2,0.3],"metadata":{"text":"vector 1"}}\n{"id":"${MOCK_VECTOR_ID_2}","values":[0.4,0.5,0.6],"namespace":"ns1"}` // two newline-separated JSON records: one with metadata, one with a namespace
+const MOCK_NDJSON_UPSERT = `{"id":"${MOCK_VECTOR_ID_1}","values":[0.11,0.22,0.33],"metadata":{"text":"updated vector 1"}}` // single record that reuses MOCK_VECTOR_ID_1 with different values and metadata text
+const MOCK_QUERY_VECTOR = [0.1, 0.2, 0.3] // vector used by the query eval, both in the prompt and in the expected `vector` argument
+
+eachModel('$modelName', ({ model }) => {
+	// Eval: explicit dimensions, metric and description must all reach vectorize_index_create.
+	describeEval('Create Vectorize Index (Dimensions/Metric)', {
+		async data() {
+			const input = `Create a Vectorize index named "${MOCK_INDEX_NAME}" with ${MOCK_DIMENSIONS} dimensions using the "${MOCK_METRIC}" metric. Add description: "${MOCK_INDEX_DESCRIPTION}".`
+			const expected = `The vectorize_index_create tool should be called with name "${MOCK_INDEX_NAME}", config specifying ${MOCK_DIMENSIONS} dimensions and "${MOCK_METRIC}" metric, and description "${MOCK_INDEX_DESCRIPTION}".`
+			return [{ input, expected }]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_create')
+			expect(found, 'Tool vectorize_index_create was not called').toBeDefined()
+
+			// Partial match at both levels: arguments not listed here are ignored.
+			const wantedConfig = { dimensions: MOCK_DIMENSIONS, metric: MOCK_METRIC }
+			const wanted = {
+				name: MOCK_INDEX_NAME,
+				config: expect.objectContaining(wantedConfig),
+				description: MOCK_INDEX_DESCRIPTION,
+			}
+			expect(found?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// --- Test vectorize_index_create (with preset) ---
+	// Variant of the create eval that names a preset instead of dimensions and a metric.
+	describeEval('Create Vectorize Index (Preset)', {
+		async data() {
+			const input = `Create a Vectorize index named "${MOCK_INDEX_NAME}-preset" using the "${MOCK_PRESET}" preset.`
+			const expected = `The vectorize_index_create tool should be called with name "${MOCK_INDEX_NAME}-preset" and config specifying the preset "${MOCK_PRESET}".`
+			return [{ input, expected }]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_create')
+			expect(found, 'Tool vectorize_index_create was not called').toBeDefined()
+
+			// Only name and config.preset are pinned; any other arguments are ignored.
+			const wanted = {
+				name: `${MOCK_INDEX_NAME}-preset`,
+				config: expect.objectContaining({ preset: MOCK_PRESET }),
+			}
+			expect(found?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// --- Test vectorize_index_list ---
+	// Two cases: a bare listing request and a paginated, ordered one.
+	describeEval('List Vectorize Indexes', {
+		async data() {
+			const plainCase = {
+				input: 'List my Vectorize indexes.',
+				expected: 'The vectorize_index_list tool should be called.',
+			}
+			const pagedCase = {
+				input: 'Show me page 2 of my Vectorize indexes, 10 per page, ordered by name descending.',
+				expected:
+					'The vectorize_index_list tool should be called with page 2, per_page 10, order name, direction desc.',
+			}
+			return [plainCase, pagedCase]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_list')
+			expect(found, 'Tool vectorize_index_list was not called').toBeDefined()
+
+			// The bare listing case has no argument expectations, so it finishes here.
+			const isPagedCase = input.includes('page 2')
+			if (!isPagedCase) {
+				return reply
+			}
+
+			const wanted = { page: 2, per_page: 10, order: 'name', direction: 'desc' }
+			expect(found?.args, 'Pagination arguments did not match').toEqual(
+				expect.objectContaining(wanted)
+			)
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// --- Test vectorize_index_get ---
+	// Asks for one index by name and checks that vectorize_index_get receives that name.
+	describeEval('Get Vectorize Index Details', {
+		async data() {
+			const input = `Get the details for the Vectorize index named "${MOCK_INDEX_NAME}".`
+			const expected = `The vectorize_index_get tool should be called with name "${MOCK_INDEX_NAME}".`
+			return [{ input, expected }]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_get')
+			expect(found, 'Tool vectorize_index_get was not called').toBeDefined()
+
+			// Only the index name is pinned; any other arguments are ignored.
+			const wanted = { name: MOCK_INDEX_NAME }
+			expect(found?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// Eval: a request for operational info must call vectorize_index_info with the index name.
+	describeEval('Get Vectorize Index Info', {
+		async data() {
+			const input = `Get operational info for the Vectorize index "${MOCK_INDEX_NAME}".`
+			const expected = `The vectorize_index_info tool should be called with name "${MOCK_INDEX_NAME}".`
+			return [{ input, expected }]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_info')
+			expect(found, 'Tool vectorize_index_info was not called').toBeDefined()
+
+			// Only the index name is pinned; any other arguments are ignored.
+			const wanted = { name: MOCK_INDEX_NAME }
+			expect(found?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// Eval: an insert request with inline NDJSON must call vectorize_index_insert for the index.
+	describeEval('Insert Vectors into Index', {
+		async data() {
+			const input = `Insert the following vectors into the "${MOCK_INDEX_NAME}" index:\n${MOCK_NDJSON_INSERT}`
+			const expected = `The vectorize_index_insert tool should be called for index "${MOCK_INDEX_NAME}" with the provided NDJSON data.`
+			return [{ input, expected }]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_insert')
+			expect(found, 'Tool vectorize_index_insert was not called').toBeDefined()
+			// Loose payload check: only the first vector's id has to appear in vectors_ndjson.
+			const wanted = {
+				name: MOCK_INDEX_NAME,
+				vectors_ndjson: expect.stringContaining(MOCK_VECTOR_ID_1),
+			}
+			expect(found?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// Eval: an upsert request with inline NDJSON must call vectorize_index_upsert for the index.
+	describeEval('Upsert Vectors into Index', {
+		async data() {
+			const input = `Upsert these vectors in the "${MOCK_INDEX_NAME}" index:\n${MOCK_NDJSON_UPSERT}`
+			const expected = `The vectorize_index_upsert tool should be called for index "${MOCK_INDEX_NAME}" with the provided NDJSON data for upserting.`
+			return [{ input, expected }]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_upsert')
+			expect(found, 'Tool vectorize_index_upsert was not called').toBeDefined()
+			// Loose payload check: only the updated metadata text has to appear in vectors_ndjson.
+			const wanted = {
+				name: MOCK_INDEX_NAME,
+				vectors_ndjson: expect.stringContaining('updated vector 1'),
+			}
+			expect(found?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// Eval: a top-5 nearest-vector request must become a matching vectorize_index_query call.
+	describeEval('Query Vectors in Index', {
+		data: async () => {
+			// The query vector is rendered as a well-formed list, e.g. "[0.1, 0.2, 0.3]".
+			const input = `Find the top 5 vectors in index "${MOCK_INDEX_NAME}" closest to this vector: [${MOCK_QUERY_VECTOR.join(', ')}]. Also return their values.`
+			const expected = `The vectorize_index_query tool should be called for index "${MOCK_INDEX_NAME}" with the provided query vector, topK=5 and returnValues=true.`
+			return [{ input, expected }]
+		},
+		task: async (input: string) => {
+			const client = await initializeClient()
+			const { promptOutput, toolCalls } = await runTask(client, model, input)
+			const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_query')
+			expect(toolCall, 'Tool vectorize_index_query was not called').toBeDefined()
+			const wanted = {
+				name: MOCK_INDEX_NAME,
+				vector: MOCK_QUERY_VECTOR,
+				top_k: 5,
+				return_values: true,
+			}
+			expect(toolCall?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+			return promptOutput
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// Eval: fetching two vectors by id must call vectorize_index_get_by_ids with both ids.
+	describeEval('Get Vectors by IDs', {
+		async data() {
+			const input = `Get vectors with IDs "${MOCK_VECTOR_ID_1}" and "${MOCK_VECTOR_ID_2}" from the "${MOCK_INDEX_NAME}" index.`
+			const expected = `The vectorize_index_get_by_ids tool should be called for index "${MOCK_INDEX_NAME}" with IDs ["${MOCK_VECTOR_ID_1}", "${MOCK_VECTOR_ID_2}"].`
+			return [{ input, expected }]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_get_by_ids')
+			expect(found, 'Tool vectorize_index_get_by_ids was not called').toBeDefined()
+			// arrayContaining: both ids must be present, in any order; extra ids are tolerated.
+			const wanted = {
+				name: MOCK_INDEX_NAME,
+				ids: expect.arrayContaining([MOCK_VECTOR_ID_1, MOCK_VECTOR_ID_2]),
+			}
+			expect(found?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// Eval: deleting two vectors by id must call vectorize_index_delete_by_ids with both ids.
+	describeEval('Delete Vectors by IDs', {
+		async data() {
+			const input = `Delete vectors with IDs "${MOCK_VECTOR_ID_1}" and "${MOCK_VECTOR_ID_2}" from the index "${MOCK_INDEX_NAME}".`
+			const expected = `The vectorize_index_delete_by_ids tool should be called for index "${MOCK_INDEX_NAME}" with IDs ["${MOCK_VECTOR_ID_1}", "${MOCK_VECTOR_ID_2}"].`
+			return [{ input, expected }]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_delete_by_ids')
+			expect(found, 'Tool vectorize_index_delete_by_ids was not called').toBeDefined()
+			// arrayContaining: both ids must be present, in any order; extra ids are tolerated.
+			const wanted = {
+				name: MOCK_INDEX_NAME,
+				ids: expect.arrayContaining([MOCK_VECTOR_ID_1, MOCK_VECTOR_ID_2]),
+			}
+			expect(found?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+
+	// Eval: a delete request must call vectorize_index_delete with the index name.
+	describeEval('Delete Vectorize Index', {
+		async data() {
+			const input = `Delete the Vectorize index named "${MOCK_INDEX_NAME}".`
+			const expected = `The vectorize_index_delete tool should be called with name "${MOCK_INDEX_NAME}".`
+			return [{ input, expected }]
+		},
+		async task(input: string) {
+			const mcpClient = await initializeClient()
+			const { toolCalls, promptOutput: reply } = await runTask(mcpClient, model, input)
+			const found = toolCalls.find(({ toolName }) => toolName === 'vectorize_index_delete')
+			expect(found, 'Tool vectorize_index_delete was not called').toBeDefined()
+
+			// Only the index name is pinned; any other arguments are ignored.
+			const wanted = { name: MOCK_INDEX_NAME }
+			expect(found?.args, 'Arguments did not match').toEqual(expect.objectContaining(wanted))
+
+			return reply
+		},
+		scorers: [checkFactuality],
+		threshold: 1,
+		timeout: 60000,
+	})
+})