|
| 1 | +import repl from 'node:repl'; |
| 2 | +import fs from 'node:fs/promises'; |
| 3 | +import { VoyageAIClient } from 'voyageai'; // Adjust import as needed |
| 4 | +import { MongoClient, Binary, BSON } from 'mongodb'; |
| 5 | +import http from 'node:http'; |
| 6 | +import { GridFSBucket } from 'mongodb'; |
| 7 | + |
| 8 | +// COMMAND: |
| 9 | +// node --no-warnings --env-file=.env --experimental-strip-types prompt.mts |
| 10 | +/** |
| 11 | + * Interactive REPL tool for searching movie reviews and posters using MongoDB vector search and VoyageAI embeddings. |
| 12 | + * |
| 13 | + * Features: |
| 14 | + * - Connects to a MongoDB database with collections for movie reviews and poster images. |
| 15 | + * - Uses VoyageAI to generate vector embeddings for text and multimodal queries. |
| 16 | + * - Caches query vectors in a local `queries.jsonl` file to avoid redundant API calls. |
| 17 | + * - Provides two main REPL commands: |
| 18 | + * - `r(query: string)`: Search for movie reviews semantically similar to the query. |
| 19 | + * - `i(query: string)`: Search for movie posters semantically similar to the query and display them in a local web server. |
| 20 | + * - Ensures required MongoDB vector search indexes exist and are queryable. |
| 21 | + * - Serves a simple HTML page at http://localhost:2390 to display poster search results. |
| 22 | + * |
| 23 | + * Environment Variables: |
| 24 | + * - `MONGODB_URI`: MongoDB connection string. |
| 25 | + * - `VOYAGE_API_KEY`: API key for VoyageAI. |
| 26 | + * |
| 27 | + * @module prompt |
| 28 | + */ |
| 29 | + |
| 30 | +/** |
| 31 | + * HOW TO RUN: |
| 32 | + * |
| 33 | + * 1. Ensure you have a `.env` file with the following variables: |
| 34 | + * MONGODB_URI=<your-mongodb-uri> |
| 35 | + * VOYAGE_API_KEY=<your-voyageai-api-key> |
| 36 | + * |
| 37 | + * 2. Install dependencies: |
| 38 | + * npm install mongodb voyageai |
| 39 | + * |
| 40 | + * 3. Run the script with Node.js (v20+ recommended): |
| 41 | + * node --no-warnings --env-file=.env --experimental-strip-types prompt.mts |
| 42 | + * |
| 43 | + * 4. In the REPL: |
| 44 | + * - Use `await r('your query')` to search reviews. |
| 45 | + * - Use `await i('your query')` to search posters (see results at http://localhost:2390). |
| 46 | + * |
| 47 | + * 5. All queries and their vectors are cached in `queries.jsonl`. |
| 48 | + */ |
| 49 | + |
| 50 | +const EJSON = BSON.EJSON; |
| 51 | + |
| 52 | +// Replace with your actual connection strings and API keys |
| 53 | +const MONGODB_URI = process.env.MONGODB_URI; |
| 54 | +if (!MONGODB_URI) throw new Error('you must provide MONGODB_URI in the env'); |
| 55 | +const VOYAGE_API_KEY = process.env.VOYAGE_API_KEY; |
| 56 | +if (!VOYAGE_API_KEY) |
| 57 | + throw new Error('you must provide VOYAGE_API_KEY in the env'); |
| 58 | + |
| 59 | +const voyage = new VoyageAIClient({ apiKey: VOYAGE_API_KEY }); |
| 60 | +const mongoClient = new MongoClient(MONGODB_URI); |
| 61 | +await mongoClient.connect(); |
| 62 | + |
| 63 | +const movies = mongoClient.db('movies'); |
| 64 | +const bucket = new GridFSBucket(movies); |
| 65 | +const imageFiles = movies.collection('fs.files'); |
| 66 | +const reviews = movies.collection('reviews'); |
| 67 | + |
| 68 | +const queriesFile = await fs.open('queries.jsonl', 'a'); |
| 69 | + |
| 70 | +console.log('\n'); |
| 71 | + |
| 72 | +async function ensureIndexes() { |
| 73 | + function indexDefFor( |
| 74 | + name: string |
| 75 | + ): ( |
| 76 | + index: { name: string; queryable: boolean }, |
| 77 | + _: number, |
| 78 | + __: any |
| 79 | + ) => boolean { |
| 80 | + return (idx) => idx.name === name && idx.queryable; |
| 81 | + } |
| 82 | + |
| 83 | + let reviewSearchIndexes = (await reviews |
| 84 | + .listSearchIndexes() |
| 85 | + .toArray()) as unknown as { name: string; queryable: boolean }[]; |
| 86 | + let imageSearchIndexes = (await imageFiles |
| 87 | + .listSearchIndexes() |
| 88 | + .toArray()) as unknown as { name: string; queryable: boolean }[]; |
| 89 | + |
| 90 | + let tries = 0; |
| 91 | + while ( |
| 92 | + (!reviewSearchIndexes.some(indexDefFor('real_for_real_index')) || |
| 93 | + !imageSearchIndexes.some(indexDefFor('poster_vec_index'))) && |
| 94 | + tries < 3 |
| 95 | + ) { |
| 96 | + console.log(`Waiting for indexes to appear (try ${tries + 1})...`); |
| 97 | + await new Promise((res) => setTimeout(res, 2000)); |
| 98 | + reviewSearchIndexes = (await reviews |
| 99 | + .listSearchIndexes() |
| 100 | + .toArray()) as unknown as { name: string; queryable: boolean }[]; |
| 101 | + imageSearchIndexes = (await imageFiles |
| 102 | + .listSearchIndexes() |
| 103 | + .toArray()) as unknown as { name: string; queryable: boolean }[]; |
| 104 | + tries++; |
| 105 | + } |
| 106 | + |
| 107 | + if (!reviewSearchIndexes.some(indexDefFor('real_for_real_index'))) { |
| 108 | + console.log('Creating review vector search index: real_for_real_index'); |
| 109 | + await reviews.createSearchIndex({ |
| 110 | + name: 'real_for_real_index', |
| 111 | + type: 'vectorSearch', |
| 112 | + definition: { |
| 113 | + fields: [ |
| 114 | + { |
| 115 | + type: 'vector', |
| 116 | + path: 'review_vec', |
| 117 | + numDimensions: 1024, |
| 118 | + similarity: 'dotProduct', |
| 119 | + }, |
| 120 | + ], |
| 121 | + }, |
| 122 | + }); |
| 123 | + } else { |
| 124 | + console.log( |
| 125 | + 'Review vector search index already exists and is queryable: real_for_real_index' |
| 126 | + ); |
| 127 | + } |
| 128 | + |
| 129 | + if (!imageSearchIndexes.some(indexDefFor('poster_vec_index'))) { |
| 130 | + console.log('Creating poster vector search index: poster_vec_index'); |
| 131 | + await imageFiles.createSearchIndex({ |
| 132 | + name: 'poster_vec_index', |
| 133 | + type: 'vectorSearch', |
| 134 | + definition: { |
| 135 | + fields: [ |
| 136 | + { |
| 137 | + type: 'vector', |
| 138 | + path: 'metadata.vector', |
| 139 | + numDimensions: 1024, |
| 140 | + similarity: 'dotProduct', |
| 141 | + }, |
| 142 | + ], |
| 143 | + }, |
| 144 | + }); |
| 145 | + } else { |
| 146 | + console.log( |
| 147 | + 'Poster vector search index already exists and is queryable: poster_vec_index' |
| 148 | + ); |
| 149 | + } |
| 150 | + |
| 151 | + while ( |
| 152 | + !reviewSearchIndexes.some(indexDefFor('real_for_real_index')) || |
| 153 | + !imageSearchIndexes.some(indexDefFor('poster_vec_index')) |
| 154 | + ) { |
| 155 | + console.log('Waiting for all indexes to be ready and queryable...'); |
| 156 | + await new Promise((res) => setTimeout(res, 2000)); |
| 157 | + reviewSearchIndexes = (await reviews |
| 158 | + .listSearchIndexes() |
| 159 | + .toArray()) as unknown as { name: string; queryable: boolean }[]; |
| 160 | + imageSearchIndexes = (await imageFiles |
| 161 | + .listSearchIndexes() |
| 162 | + .toArray()) as unknown as { name: string; queryable: boolean }[]; |
| 163 | + } |
| 164 | + console.log('All required indexes are ready and queryable.'); |
| 165 | +} |
| 166 | + |
| 167 | +await ensureIndexes(); |
| 168 | + |
| 169 | +async function getCachedVector(query: string): Promise<Binary | null> { |
| 170 | + // Read the file from the beginning using fs.readFile |
| 171 | + const content = await fs.readFile('queries.jsonl', { encoding: 'utf8' }); |
| 172 | + const lines = content.split('\n').filter((l) => Boolean(l)); |
| 173 | + for (const line of lines) { |
| 174 | + try { |
| 175 | + const entry = EJSON.parse(line, { relaxed: false }); |
| 176 | + if (entry.query === query && entry.vector) { |
| 177 | + // entry.vector is EJSON, convert to Binary |
| 178 | + return entry.vector; |
| 179 | + } |
| 180 | + } catch { |
| 181 | + /* ignore parse errors */ |
| 182 | + } |
| 183 | + } |
| 184 | + return null; |
| 185 | +} |
| 186 | + |
| 187 | +async function r(query: string) { |
| 188 | + await mongoClient.connect(); |
| 189 | + |
| 190 | + // Try to get vector from cache |
| 191 | + let vector = await getCachedVector(query); |
| 192 | + |
| 193 | + if (vector == null) { |
| 194 | + // Get vector from VoyageAI |
| 195 | + const response = await voyage.embed({ |
| 196 | + model: 'voyage-3-large', |
| 197 | + input: query, |
| 198 | + }); |
| 199 | + |
| 200 | + vector = Binary.fromFloat32Array( |
| 201 | + new Float32Array(response.data[0].embedding) |
| 202 | + ); |
| 203 | + await queriesFile.appendFile(EJSON.stringify({ query, vector }) + '\n'); |
| 204 | + } |
| 205 | + |
| 206 | + // Run vector search aggregation |
| 207 | + const pipeline = [ |
| 208 | + { |
| 209 | + $vectorSearch: { |
| 210 | + index: 'real_for_real_index', |
| 211 | + path: 'review_vec', |
| 212 | + queryVector: vector, |
| 213 | + numCandidates: replInstance.context.numCandidates, |
| 214 | + limit: replInstance.context.limit, |
| 215 | + }, |
| 216 | + }, |
| 217 | + { |
| 218 | + $project: { |
| 219 | + _id: 0, |
| 220 | + review: 1, |
| 221 | + rating: 1, |
| 222 | + score: { $meta: 'vectorSearchScore' }, |
| 223 | + }, |
| 224 | + }, |
| 225 | + ]; |
| 226 | + |
| 227 | + const results = await reviews.aggregate(pipeline).toArray(); |
| 228 | + return results; |
| 229 | +} |
| 230 | + |
| 231 | +let html = (content) => ` |
| 232 | + <!DOCTYPE html> |
| 233 | + <html lang="en"> |
| 234 | + <meta charset="UTF-8"> |
| 235 | + <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| 236 | + <body>${content}</body> |
| 237 | + </html> |
| 238 | +`; |
| 239 | + |
| 240 | +let lastHtml = html('<h2>No posters searched yet.</h2>'); |
| 241 | +let server: http.Server | null = null; |
| 242 | + |
| 243 | +// Start HTTP server |
| 244 | +server = http.createServer((req, res) => { |
| 245 | + res.writeHead(200, { 'Content-Type': 'text/html' }); |
| 246 | + res.end(lastHtml); |
| 247 | +}); |
| 248 | +console.log('Images: http://localhost:2390'); |
| 249 | +server.listen(2390, 'localhost', () => {}); |
| 250 | + |
| 251 | +async function i(query: string) { |
| 252 | + await mongoClient.connect(); |
| 253 | + |
| 254 | + // Try to get vector from cache |
| 255 | + let vector = await getCachedVector(query); |
| 256 | + |
| 257 | + if (vector == null) { |
| 258 | + const inputs = [{ content: [{ type: 'text', text: query }] }]; |
| 259 | + |
| 260 | + const response = await voyage.multimodalEmbed({ |
| 261 | + inputs, |
| 262 | + model: 'voyage-multimodal-3', |
| 263 | + }); |
| 264 | + vector = Binary.fromFloat32Array( |
| 265 | + new Float32Array(response.data[0].embedding) |
| 266 | + ); |
| 267 | + await queriesFile.appendFile(EJSON.stringify({ query, vector }) + '\n'); |
| 268 | + } |
| 269 | + |
| 270 | + // Vector search for poster files (assume 'poster_vec' field in files collection) |
| 271 | + const filesCollection = movies.collection('fs.files'); |
| 272 | + const pipeline = [ |
| 273 | + { |
| 274 | + $vectorSearch: { |
| 275 | + index: 'poster_vec_index', |
| 276 | + path: 'metadata.vector', |
| 277 | + queryVector: vector, |
| 278 | + numCandidates: replInstance.context.numCandidates, |
| 279 | + limit: replInstance.context.limit, |
| 280 | + }, |
| 281 | + }, |
| 282 | + { |
| 283 | + $project: { |
| 284 | + _id: 1, |
| 285 | + filename: 1, |
| 286 | + score: { $meta: 'vectorSearchScore' }, |
| 287 | + }, |
| 288 | + }, |
| 289 | + ]; |
| 290 | + const files = await filesCollection.aggregate(pipeline).toArray(); |
| 291 | + |
| 292 | + // Fetch image data from GridFS and build HTML |
| 293 | + let imageHtml = '<h2>Posters for: ' + query + '</h2>'; |
| 294 | + for (const file of files) { |
| 295 | + try { |
| 296 | + const downloadStream = bucket.openDownloadStream(file._id); |
| 297 | + const chunks: Buffer[] = []; |
| 298 | + for await (const chunk of downloadStream) { |
| 299 | + chunks.push(chunk as Buffer); |
| 300 | + } |
| 301 | + const buffer = Buffer.concat(chunks); |
| 302 | + const base64 = buffer.toString('base64'); |
| 303 | + imageHtml += `<div style="display:inline-block;margin:8px;"><img src="data:image/jpeg;base64,${base64}" style="max-width:200px;max-height:300px;"/><br>${file.filename}</div>`; |
| 304 | + } catch (e) { |
| 305 | + imageHtml += `<div>Error loading poster: ${file.filename}</div>`; |
| 306 | + } |
| 307 | + } |
| 308 | + lastHtml = html(imageHtml); |
| 309 | + return files.map((f) => f.filename); |
| 310 | +} |
| 311 | + |
| 312 | +console.log('Use `r(string)` to search for reviews.'); |
| 313 | +console.log('Use `i(string)` to search posters.'); |
| 314 | +console.log('All queries are saved to queries.jsonl'); |
| 315 | +console.log('\n'); |
| 316 | + |
| 317 | +const replInstance = repl.start({ prompt: 'niva_neal ✨ ' }); |
| 318 | +replInstance.context.r = r; |
| 319 | +replInstance.context.i = i; |
| 320 | +replInstance.context.numCandidates = 100; |
| 321 | +replInstance.context.limit = 5; |
| 322 | + |
| 323 | +replInstance.on('exit', async () => { |
| 324 | + console.log('bye bye 👋'); |
| 325 | + await mongoClient.close(); |
| 326 | + await queriesFile.close(); |
| 327 | + server?.close(); |
| 328 | + replInstance.close(); |
| 329 | + process.exit(); |
| 330 | +}); |
0 commit comments