|
| 1 | +import { expect } from 'vitest' |
| 2 | +import { describeEval } from 'vitest-evals' |
| 3 | + |
| 4 | +import { checkFactuality } from '@repo/eval-tools/src/scorers' |
| 5 | +import { eachModel } from '@repo/eval-tools/src/test-models' |
| 6 | + |
| 7 | +import { initializeClient, runTask } from './utils' // Assuming utils.ts will exist here |
| 8 | + |
| 9 | +const MOCK_INDEX_NAME = 'test-vectorize-index' // shared by every eval below; the preset eval appends "-preset" |
| 10 | +const MOCK_INDEX_DESCRIPTION = 'A test index for evaluation' |
| 11 | +const MOCK_DIMENSIONS = 50 // NOTE(review): the sample vectors below carry 3 values each, not 50 — confirm the mismatch is intentional |
| 12 | +const MOCK_METRIC = 'cosine' |
| 13 | +const MOCK_PRESET = '@cf/baai/bge-small-en-v1.5' |
| 14 | +const MOCK_VECTOR_ID_1 = 'vec1' |
| 15 | +const MOCK_VECTOR_ID_2 = 'vec2' |
| 16 | +const MOCK_NDJSON_INSERT = `{"id":"${MOCK_VECTOR_ID_1}","values":[0.1,0.2,0.3],"metadata":{"text":"vector 1"}}\n{"id":"${MOCK_VECTOR_ID_2}","values":[0.4,0.5,0.6],"namespace":"ns1"}` // two newline-separated records: the first has metadata, the second a namespace |
| 17 | +const MOCK_NDJSON_UPSERT = `{"id":"${MOCK_VECTOR_ID_1}","values":[0.11,0.22,0.33],"metadata":{"text":"updated vector 1"}}` // single record reusing the first vector ID with new values and metadata |
| 18 | +const MOCK_QUERY_VECTOR = [0.1, 0.2, 0.3] // same values as the first inserted vector |
| 19 | + |
| 20 | +eachModel('$modelName', ({ model }) => { |
| 21 | + describeEval('Create Vectorize Index (Dimensions/Metric)', { // --- Test vectorize_index_create (explicit dimensions and metric) --- |
| 22 | + data: async () => [ |
| 23 | + { |
| 24 | + input: `Create a Vectorize index named "${MOCK_INDEX_NAME}" with ${MOCK_DIMENSIONS} dimensions using the "${MOCK_METRIC}" metric. Add description: "${MOCK_INDEX_DESCRIPTION}".`, |
| 25 | + expected: `The vectorize_index_create tool should be called with name "${MOCK_INDEX_NAME}", config specifying ${MOCK_DIMENSIONS} dimensions and "${MOCK_METRIC}" metric, and description "${MOCK_INDEX_DESCRIPTION}".`, |
| 26 | + }, |
| 27 | + ], |
| 28 | + task: async (input: string) => { |
| 29 | + const client = await initializeClient() |
| 30 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 31 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_create') // first matching call, or undefined if the tool was never invoked |
| 32 | + expect(toolCall, 'Tool vectorize_index_create was not called').toBeDefined() |
| 33 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 34 | + expect.objectContaining({ // partial match: additional top-level args are tolerated |
| 35 | + name: MOCK_INDEX_NAME, |
| 36 | + config: expect.objectContaining({ // partial match on config as well |
| 37 | + dimensions: MOCK_DIMENSIONS, |
| 38 | + metric: MOCK_METRIC, |
| 39 | + }), |
| 40 | + description: MOCK_INDEX_DESCRIPTION, |
| 41 | + }) |
| 42 | + ) |
| 43 | + return promptOutput |
| 44 | + }, |
| 45 | + scorers: [checkFactuality], |
| 46 | + threshold: 1, |
| 47 | + timeout: 60000, |
| 48 | + }) |
| 49 | + |
| 50 | + // --- Test vectorize_index_create (with preset) --- |
| 51 | + describeEval('Create Vectorize Index (Preset)', { |
| 52 | + data: async () => [ |
| 53 | + { |
| 54 | + input: `Create a Vectorize index named "${MOCK_INDEX_NAME}-preset" using the "${MOCK_PRESET}" preset.`, |
| 55 | + expected: `The vectorize_index_create tool should be called with name "${MOCK_INDEX_NAME}-preset" and config specifying the preset "${MOCK_PRESET}".`, |
| 56 | + }, |
| 57 | + ], |
| 58 | + task: async (input: string) => { |
| 59 | + const client = await initializeClient() |
| 60 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 61 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_create') // first matching call, or undefined if the tool was never invoked |
| 62 | + expect(toolCall, 'Tool vectorize_index_create was not called').toBeDefined() |
| 63 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 64 | + expect.objectContaining({ // partial match: additional top-level args are tolerated |
| 65 | + name: `${MOCK_INDEX_NAME}-preset`, |
| 66 | + config: expect.objectContaining({ // only the preset is checked; dimensions/metric are not asserted here |
| 67 | + preset: MOCK_PRESET, |
| 68 | + }), |
| 69 | + }) |
| 70 | + ) |
| 71 | + return promptOutput |
| 72 | + }, |
| 73 | + scorers: [checkFactuality], |
| 74 | + threshold: 1, |
| 75 | + timeout: 60000, |
| 76 | + }) |
| 77 | + |
| 78 | + // --- Test vectorize_index_list (plain listing and paginated listing) --- |
| 79 | + describeEval('List Vectorize Indexes', { |
| 80 | + data: async () => [ |
| 81 | + { |
| 82 | + input: 'List my Vectorize indexes.', |
| 83 | + expected: 'The vectorize_index_list tool should be called.', |
| 84 | + }, |
| 85 | + { |
| 86 | + input: 'Show me page 2 of my Vectorize indexes, 10 per page, ordered by name descending.', |
| 87 | + expected: |
| 88 | + 'The vectorize_index_list tool should be called with page 2, per_page 10, order name, direction desc.', |
| 89 | + }, |
| 90 | + ], |
| 91 | + task: async (input: string) => { |
| 92 | + const client = await initializeClient() |
| 93 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 94 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_list') // first matching call, or undefined if the tool was never invoked |
| 95 | + expect(toolCall, 'Tool vectorize_index_list was not called').toBeDefined() |
| 96 | + |
| 97 | + // Check specific args only for the pagination case; the case is recognised by the 'page 2' substring of its prompt, so keep the two in sync |
| 98 | + if (input.includes('page 2')) { |
| 99 | + expect(toolCall?.args, 'Pagination arguments did not match').toEqual( |
| 100 | + expect.objectContaining({ // partial match: additional args are tolerated |
| 101 | + page: 2, |
| 102 | + per_page: 10, |
| 103 | + order: 'name', |
| 104 | + direction: 'desc', |
| 105 | + }) |
| 106 | + ) |
| 107 | + } |
| 108 | + |
| 109 | + return promptOutput |
| 110 | + }, |
| 111 | + scorers: [checkFactuality], |
| 112 | + threshold: 1, |
| 113 | + timeout: 60000, |
| 114 | + }) |
| 115 | + |
| 116 | + // --- Test vectorize_index_get (lookup of a single index by name) --- |
| 117 | + describeEval('Get Vectorize Index Details', { |
| 118 | + data: async () => [ |
| 119 | + { |
| 120 | + input: `Get the details for the Vectorize index named "${MOCK_INDEX_NAME}".`, |
| 121 | + expected: `The vectorize_index_get tool should be called with name "${MOCK_INDEX_NAME}".`, |
| 122 | + }, |
| 123 | + ], |
| 124 | + task: async (input: string) => { |
| 125 | + const client = await initializeClient() |
| 126 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 127 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_get') // first matching call, or undefined if the tool was never invoked |
| 128 | + expect(toolCall, 'Tool vectorize_index_get was not called').toBeDefined() |
| 129 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 130 | + expect.objectContaining({ // partial match: additional args are tolerated |
| 131 | + name: MOCK_INDEX_NAME, |
| 132 | + }) |
| 133 | + ) |
| 134 | + return promptOutput |
| 135 | + }, |
| 136 | + scorers: [checkFactuality], |
| 137 | + threshold: 1, |
| 138 | + timeout: 60000, |
| 139 | + }) |
| 140 | + |
| 141 | + describeEval('Get Vectorize Index Info', { // --- Test vectorize_index_info --- |
| 142 | + data: async () => [ |
| 143 | + { |
| 144 | + input: `Get operational info for the Vectorize index "${MOCK_INDEX_NAME}".`, |
| 145 | + expected: `The vectorize_index_info tool should be called with name "${MOCK_INDEX_NAME}".`, |
| 146 | + }, |
| 147 | + ], |
| 148 | + task: async (input: string) => { |
| 149 | + const client = await initializeClient() |
| 150 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 151 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_info') // first matching call, or undefined if the tool was never invoked |
| 152 | + expect(toolCall, 'Tool vectorize_index_info was not called').toBeDefined() |
| 153 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 154 | + expect.objectContaining({ // partial match: additional args are tolerated |
| 155 | + name: MOCK_INDEX_NAME, |
| 156 | + }) |
| 157 | + ) |
| 158 | + return promptOutput |
| 159 | + }, |
| 160 | + scorers: [checkFactuality], |
| 161 | + threshold: 1, |
| 162 | + timeout: 60000, |
| 163 | + }) |
| 164 | + |
| 165 | + describeEval('Insert Vectors into Index', { // --- Test vectorize_index_insert --- |
| 166 | + data: async () => [ |
| 167 | + { |
| 168 | + input: `Insert the following vectors into the "${MOCK_INDEX_NAME}" index:\n${MOCK_NDJSON_INSERT}`, |
| 169 | + expected: `The vectorize_index_insert tool should be called for index "${MOCK_INDEX_NAME}" with the provided NDJSON data.`, |
| 170 | + }, |
| 171 | + ], |
| 172 | + task: async (input: string) => { |
| 173 | + const client = await initializeClient() |
| 174 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 175 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_insert') // first matching call, or undefined if the tool was never invoked |
| 176 | + expect(toolCall, 'Tool vectorize_index_insert was not called').toBeDefined() |
| 177 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 178 | + expect.objectContaining({ // partial match: additional args are tolerated |
| 179 | + name: MOCK_INDEX_NAME, |
| 180 | + vectors_ndjson: expect.stringContaining(MOCK_VECTOR_ID_1), // Loose check: only the first vector's ID must appear in the payload; the second record and the values are not asserted |
| 181 | + }) |
| 182 | + ) |
| 183 | + return promptOutput |
| 184 | + }, |
| 185 | + scorers: [checkFactuality], |
| 186 | + threshold: 1, |
| 187 | + timeout: 60000, |
| 188 | + }) |
| 189 | + |
| 190 | + describeEval('Upsert Vectors into Index', { // --- Test vectorize_index_upsert --- |
| 191 | + data: async () => [ |
| 192 | + { |
| 193 | + input: `Upsert these vectors in the "${MOCK_INDEX_NAME}" index:\n${MOCK_NDJSON_UPSERT}`, |
| 194 | + expected: `The vectorize_index_upsert tool should be called for index "${MOCK_INDEX_NAME}" with the provided NDJSON data for upserting.`, |
| 195 | + }, |
| 196 | + ], |
| 197 | + task: async (input: string) => { |
| 198 | + const client = await initializeClient() |
| 199 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 200 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_upsert') // first matching call, or undefined if the tool was never invoked |
| 201 | + expect(toolCall, 'Tool vectorize_index_upsert was not called').toBeDefined() |
| 202 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 203 | + expect.objectContaining({ // partial match: additional args are tolerated |
| 204 | + name: MOCK_INDEX_NAME, |
| 205 | + vectors_ndjson: expect.stringContaining('updated vector 1'), // Loose check: the payload must carry the metadata text from MOCK_NDJSON_UPSERT; ID and values are not asserted |
| 206 | + }) |
| 207 | + ) |
| 208 | + return promptOutput |
| 209 | + }, |
| 210 | + scorers: [checkFactuality], |
| 211 | + threshold: 1, |
| 212 | + timeout: 60000, |
| 213 | + }) |
| 214 | + |
| 215 | + describeEval('Query Vectors in Index', { // --- Test vectorize_index_query --- |
| 216 | + data: async () => [ |
| 217 | + { |
| 218 | + input: `Find the top 5 vectors in index "${MOCK_INDEX_NAME}" closest to this vector: [${MOCK_QUERY_VECTOR.join(', ')}]. Also return their values.`, |
| 219 | + expected: `The vectorize_index_query tool should be called for index "${MOCK_INDEX_NAME}" with the provided query vector, topK=5 and returnValues=true.`, |
| 220 | + }, |
| 221 | + ], |
| 222 | + task: async (input: string) => { |
| 223 | + const client = await initializeClient() |
| 224 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 225 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_query') // first matching call, or undefined if the tool was never invoked |
| 226 | + expect(toolCall, 'Tool vectorize_index_query was not called').toBeDefined() |
| 227 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 228 | + expect.objectContaining({ // partial match: additional args are tolerated |
| 229 | + name: MOCK_INDEX_NAME, |
| 230 | + vector: MOCK_QUERY_VECTOR, // must equal the query vector exactly, element for element |
| 231 | + top_k: 5, |
| 232 | + return_values: true, |
| 233 | + }) |
| 234 | + ) |
| 235 | + return promptOutput |
| 236 | + }, |
| 237 | + scorers: [checkFactuality], |
| 238 | + threshold: 1, |
| 239 | + timeout: 60000, |
| 240 | + }) |
| 241 | + |
| 242 | + describeEval('Get Vectors by IDs', { // --- Test vectorize_index_get_by_ids --- |
| 243 | + data: async () => [ |
| 244 | + { |
| 245 | + input: `Get vectors with IDs "${MOCK_VECTOR_ID_1}" and "${MOCK_VECTOR_ID_2}" from the "${MOCK_INDEX_NAME}" index.`, |
| 246 | + expected: `The vectorize_index_get_by_ids tool should be called for index "${MOCK_INDEX_NAME}" with IDs ["${MOCK_VECTOR_ID_1}", "${MOCK_VECTOR_ID_2}"].`, |
| 247 | + }, |
| 248 | + ], |
| 249 | + task: async (input: string) => { |
| 250 | + const client = await initializeClient() |
| 251 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 252 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_get_by_ids') // first matching call, or undefined if the tool was never invoked |
| 253 | + expect(toolCall, 'Tool vectorize_index_get_by_ids was not called').toBeDefined() |
| 254 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 255 | + expect.objectContaining({ // partial match: additional args are tolerated |
| 256 | + name: MOCK_INDEX_NAME, |
| 257 | + ids: expect.arrayContaining([MOCK_VECTOR_ID_1, MOCK_VECTOR_ID_2]), // both IDs must be present; order is not checked and extra IDs would also pass |
| 258 | + }) |
| 259 | + ) |
| 260 | + return promptOutput |
| 261 | + }, |
| 262 | + scorers: [checkFactuality], |
| 263 | + threshold: 1, |
| 264 | + timeout: 60000, |
| 265 | + }) |
| 266 | + |
| 267 | + describeEval('Delete Vectors by IDs', { // --- Test vectorize_index_delete_by_ids --- |
| 268 | + data: async () => [ |
| 269 | + { |
| 270 | + input: `Delete vectors with IDs "${MOCK_VECTOR_ID_1}" and "${MOCK_VECTOR_ID_2}" from the index "${MOCK_INDEX_NAME}".`, |
| 271 | + expected: `The vectorize_index_delete_by_ids tool should be called for index "${MOCK_INDEX_NAME}" with IDs ["${MOCK_VECTOR_ID_1}", "${MOCK_VECTOR_ID_2}"].`, |
| 272 | + }, |
| 273 | + ], |
| 274 | + task: async (input: string) => { |
| 275 | + const client = await initializeClient() |
| 276 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 277 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_delete_by_ids') // first matching call, or undefined if the tool was never invoked |
| 278 | + expect(toolCall, 'Tool vectorize_index_delete_by_ids was not called').toBeDefined() |
| 279 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 280 | + expect.objectContaining({ // partial match: additional args are tolerated |
| 281 | + name: MOCK_INDEX_NAME, |
| 282 | + ids: expect.arrayContaining([MOCK_VECTOR_ID_1, MOCK_VECTOR_ID_2]), // both IDs must be present; order is not checked and extra IDs would also pass |
| 283 | + }) |
| 284 | + ) |
| 285 | + return promptOutput |
| 286 | + }, |
| 287 | + scorers: [checkFactuality], |
| 288 | + threshold: 1, |
| 289 | + timeout: 60000, |
| 290 | + }) |
| 291 | + |
| 292 | + describeEval('Delete Vectorize Index', { // --- Test vectorize_index_delete --- |
| 293 | + data: async () => [ |
| 294 | + { |
| 295 | + input: `Delete the Vectorize index named "${MOCK_INDEX_NAME}".`, |
| 296 | + expected: `The vectorize_index_delete tool should be called with name "${MOCK_INDEX_NAME}".`, |
| 297 | + }, |
| 298 | + ], |
| 299 | + task: async (input: string) => { |
| 300 | + const client = await initializeClient() |
| 301 | + const { promptOutput, toolCalls } = await runTask(client, model, input) |
| 302 | + const toolCall = toolCalls.find((call) => call.toolName === 'vectorize_index_delete') // first matching call, or undefined if the tool was never invoked |
| 303 | + expect(toolCall, 'Tool vectorize_index_delete was not called').toBeDefined() |
| 304 | + expect(toolCall?.args, 'Arguments did not match').toEqual( |
| 305 | + expect.objectContaining({ // partial match: additional args are tolerated |
| 306 | + name: MOCK_INDEX_NAME, |
| 307 | + }) |
| 308 | + ) |
| 309 | + return promptOutput |
| 310 | + }, |
| 311 | + scorers: [checkFactuality], |
| 312 | + threshold: 1, |
| 313 | + timeout: 60000, |
| 314 | + }) |
| 315 | +}) |
0 commit comments