Skip to content

Commit 83250c1

Browse files
committed
chore: add prompt helper
1 parent a906bff commit 83250c1

File tree

3 files changed

+343
-0
lines changed

3 files changed

+343
-0
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.env
2+
queries.jsonl
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"name": "neal_niva_prompt",
3+
"private": true,
4+
"scripts": {
5+
"start": "node --no-warnings --env-file=.env --experimental-strip-types prompt.mts"
6+
},
7+
"dependencies": {
8+
"mongodb": "^6.16.0",
9+
"voyageai": "^0.0.4"
10+
}
11+
}
Lines changed: 330 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,330 @@
1+
import repl from 'node:repl';
2+
import fs from 'node:fs/promises';
3+
import { VoyageAIClient } from 'voyageai'; // Adjust import as needed
4+
import { MongoClient, Binary, BSON } from 'mongodb';
5+
import http from 'node:http';
6+
import { GridFSBucket } from 'mongodb';
7+
8+
// COMMAND:
9+
// node --no-warnings --env-file=.env --experimental-strip-types prompt.mts
10+
/**
11+
* Interactive REPL tool for searching movie reviews and posters using MongoDB vector search and VoyageAI embeddings.
12+
*
13+
* Features:
14+
* - Connects to a MongoDB database with collections for movie reviews and poster images.
15+
* - Uses VoyageAI to generate vector embeddings for text and multimodal queries.
16+
* - Caches query vectors in a local `queries.jsonl` file to avoid redundant API calls.
17+
* - Provides two main REPL commands:
18+
* - `r(query: string)`: Search for movie reviews semantically similar to the query.
19+
* - `i(query: string)`: Search for movie posters semantically similar to the query and display them in a local web server.
20+
* - Ensures required MongoDB vector search indexes exist and are queryable.
21+
* - Serves a simple HTML page at http://localhost:2390 to display poster search results.
22+
*
23+
* Environment Variables:
24+
* - `MONGODB_URI`: MongoDB connection string.
25+
* - `VOYAGE_API_KEY`: API key for VoyageAI.
26+
*
27+
* @module prompt
28+
*/
29+
30+
/**
31+
* HOW TO RUN:
32+
*
33+
* 1. Ensure you have a `.env` file with the following variables:
34+
* MONGODB_URI=<your-mongodb-uri>
35+
* VOYAGE_API_KEY=<your-voyageai-api-key>
36+
*
37+
* 2. Install dependencies:
38+
* npm install mongodb voyageai
39+
*
40+
* 3. Run the script with Node.js (v20+ recommended):
41+
* node --no-warnings --env-file=.env --experimental-strip-types prompt.mts
42+
*
43+
* 4. In the REPL:
44+
* - Use `await r('your query')` to search reviews.
45+
* - Use `await i('your query')` to search posters (see results at http://localhost:2390).
46+
*
47+
* 5. All queries and their vectors are cached in `queries.jsonl`.
48+
*/
49+
50+
const EJSON = BSON.EJSON;
51+
52+
// Replace with your actual connection strings and API keys
53+
const MONGODB_URI = process.env.MONGODB_URI;
54+
if (!MONGODB_URI) throw new Error('you must provide MONGODB_URI in the env');
55+
const VOYAGE_API_KEY = process.env.VOYAGE_API_KEY;
56+
if (!VOYAGE_API_KEY)
57+
throw new Error('you must provide VOYAGE_API_KEY in the env');
58+
59+
const voyage = new VoyageAIClient({ apiKey: VOYAGE_API_KEY });
60+
const mongoClient = new MongoClient(MONGODB_URI);
61+
await mongoClient.connect();
62+
63+
const movies = mongoClient.db('movies');
64+
const bucket = new GridFSBucket(movies);
65+
const imageFiles = movies.collection('fs.files');
66+
const reviews = movies.collection('reviews');
67+
68+
const queriesFile = await fs.open('queries.jsonl', 'a');
69+
70+
console.log('\n');
71+
72+
async function ensureIndexes() {
73+
function indexDefFor(
74+
name: string
75+
): (
76+
index: { name: string; queryable: boolean },
77+
_: number,
78+
__: any
79+
) => boolean {
80+
return (idx) => idx.name === name && idx.queryable;
81+
}
82+
83+
let reviewSearchIndexes = (await reviews
84+
.listSearchIndexes()
85+
.toArray()) as unknown as { name: string; queryable: boolean }[];
86+
let imageSearchIndexes = (await imageFiles
87+
.listSearchIndexes()
88+
.toArray()) as unknown as { name: string; queryable: boolean }[];
89+
90+
let tries = 0;
91+
while (
92+
(!reviewSearchIndexes.some(indexDefFor('real_for_real_index')) ||
93+
!imageSearchIndexes.some(indexDefFor('poster_vec_index'))) &&
94+
tries < 3
95+
) {
96+
console.log(`Waiting for indexes to appear (try ${tries + 1})...`);
97+
await new Promise((res) => setTimeout(res, 2000));
98+
reviewSearchIndexes = (await reviews
99+
.listSearchIndexes()
100+
.toArray()) as unknown as { name: string; queryable: boolean }[];
101+
imageSearchIndexes = (await imageFiles
102+
.listSearchIndexes()
103+
.toArray()) as unknown as { name: string; queryable: boolean }[];
104+
tries++;
105+
}
106+
107+
if (!reviewSearchIndexes.some(indexDefFor('real_for_real_index'))) {
108+
console.log('Creating review vector search index: real_for_real_index');
109+
await reviews.createSearchIndex({
110+
name: 'real_for_real_index',
111+
type: 'vectorSearch',
112+
definition: {
113+
fields: [
114+
{
115+
type: 'vector',
116+
path: 'review_vec',
117+
numDimensions: 1024,
118+
similarity: 'dotProduct',
119+
},
120+
],
121+
},
122+
});
123+
} else {
124+
console.log(
125+
'Review vector search index already exists and is queryable: real_for_real_index'
126+
);
127+
}
128+
129+
if (!imageSearchIndexes.some(indexDefFor('poster_vec_index'))) {
130+
console.log('Creating poster vector search index: poster_vec_index');
131+
await imageFiles.createSearchIndex({
132+
name: 'poster_vec_index',
133+
type: 'vectorSearch',
134+
definition: {
135+
fields: [
136+
{
137+
type: 'vector',
138+
path: 'metadata.vector',
139+
numDimensions: 1024,
140+
similarity: 'dotProduct',
141+
},
142+
],
143+
},
144+
});
145+
} else {
146+
console.log(
147+
'Poster vector search index already exists and is queryable: poster_vec_index'
148+
);
149+
}
150+
151+
while (
152+
!reviewSearchIndexes.some(indexDefFor('real_for_real_index')) ||
153+
!imageSearchIndexes.some(indexDefFor('poster_vec_index'))
154+
) {
155+
console.log('Waiting for all indexes to be ready and queryable...');
156+
await new Promise((res) => setTimeout(res, 2000));
157+
reviewSearchIndexes = (await reviews
158+
.listSearchIndexes()
159+
.toArray()) as unknown as { name: string; queryable: boolean }[];
160+
imageSearchIndexes = (await imageFiles
161+
.listSearchIndexes()
162+
.toArray()) as unknown as { name: string; queryable: boolean }[];
163+
}
164+
console.log('All required indexes are ready and queryable.');
165+
}
166+
167+
await ensureIndexes();
168+
169+
async function getCachedVector(query: string): Promise<Binary | null> {
170+
// Read the file from the beginning using fs.readFile
171+
const content = await fs.readFile('queries.jsonl', { encoding: 'utf8' });
172+
const lines = content.split('\n').filter((l) => Boolean(l));
173+
for (const line of lines) {
174+
try {
175+
const entry = EJSON.parse(line, { relaxed: false });
176+
if (entry.query === query && entry.vector) {
177+
// entry.vector is EJSON, convert to Binary
178+
return entry.vector;
179+
}
180+
} catch {
181+
/* ignore parse errors */
182+
}
183+
}
184+
return null;
185+
}
186+
187+
async function r(query: string) {
188+
await mongoClient.connect();
189+
190+
// Try to get vector from cache
191+
let vector = await getCachedVector(query);
192+
193+
if (vector == null) {
194+
// Get vector from VoyageAI
195+
const response = await voyage.embed({
196+
model: 'voyage-3-large',
197+
input: query,
198+
});
199+
200+
vector = Binary.fromFloat32Array(
201+
new Float32Array(response.data[0].embedding)
202+
);
203+
await queriesFile.appendFile(EJSON.stringify({ query, vector }) + '\n');
204+
}
205+
206+
// Run vector search aggregation
207+
const pipeline = [
208+
{
209+
$vectorSearch: {
210+
index: 'real_for_real_index',
211+
path: 'review_vec',
212+
queryVector: vector,
213+
numCandidates: replInstance.context.numCandidates,
214+
limit: replInstance.context.limit,
215+
},
216+
},
217+
{
218+
$project: {
219+
_id: 0,
220+
review: 1,
221+
rating: 1,
222+
score: { $meta: 'vectorSearchScore' },
223+
},
224+
},
225+
];
226+
227+
const results = await reviews.aggregate(pipeline).toArray();
228+
return results;
229+
}
230+
231+
let html = (content) => `
232+
<!DOCTYPE html>
233+
<html lang="en">
234+
<meta charset="UTF-8">
235+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
236+
<body>${content}</body>
237+
</html>
238+
`;
239+
240+
let lastHtml = html('<h2>No posters searched yet.</h2>');
241+
let server: http.Server | null = null;
242+
243+
// Start HTTP server
244+
server = http.createServer((req, res) => {
245+
res.writeHead(200, { 'Content-Type': 'text/html' });
246+
res.end(lastHtml);
247+
});
248+
console.log('Images: http://localhost:2390');
249+
server.listen(2390, 'localhost', () => {});
250+
251+
async function i(query: string) {
252+
await mongoClient.connect();
253+
254+
// Try to get vector from cache
255+
let vector = await getCachedVector(query);
256+
257+
if (vector == null) {
258+
const inputs = [{ content: [{ type: 'text', text: query }] }];
259+
260+
const response = await voyage.multimodalEmbed({
261+
inputs,
262+
model: 'voyage-multimodal-3',
263+
});
264+
vector = Binary.fromFloat32Array(
265+
new Float32Array(response.data[0].embedding)
266+
);
267+
await queriesFile.appendFile(EJSON.stringify({ query, vector }) + '\n');
268+
}
269+
270+
// Vector search for poster files (assume 'poster_vec' field in files collection)
271+
const filesCollection = movies.collection('fs.files');
272+
const pipeline = [
273+
{
274+
$vectorSearch: {
275+
index: 'poster_vec_index',
276+
path: 'metadata.vector',
277+
queryVector: vector,
278+
numCandidates: replInstance.context.numCandidates,
279+
limit: replInstance.context.limit,
280+
},
281+
},
282+
{
283+
$project: {
284+
_id: 1,
285+
filename: 1,
286+
score: { $meta: 'vectorSearchScore' },
287+
},
288+
},
289+
];
290+
const files = await filesCollection.aggregate(pipeline).toArray();
291+
292+
// Fetch image data from GridFS and build HTML
293+
let imageHtml = '<h2>Posters for: ' + query + '</h2>';
294+
for (const file of files) {
295+
try {
296+
const downloadStream = bucket.openDownloadStream(file._id);
297+
const chunks: Buffer[] = [];
298+
for await (const chunk of downloadStream) {
299+
chunks.push(chunk as Buffer);
300+
}
301+
const buffer = Buffer.concat(chunks);
302+
const base64 = buffer.toString('base64');
303+
imageHtml += `<div style="display:inline-block;margin:8px;"><img src="data:image/jpeg;base64,${base64}" style="max-width:200px;max-height:300px;"/><br>${file.filename}</div>`;
304+
} catch (e) {
305+
imageHtml += `<div>Error loading poster: ${file.filename}</div>`;
306+
}
307+
}
308+
lastHtml = html(imageHtml);
309+
return files.map((f) => f.filename);
310+
}
311+
312+
console.log('Use `r(string)` to search for reviews.');
313+
console.log('Use `i(string)` to search posters.');
314+
console.log('All queries are saved to queries.jsonl');
315+
console.log('\n');
316+
317+
const replInstance = repl.start({ prompt: 'niva_neal ✨ ' });
318+
replInstance.context.r = r;
319+
replInstance.context.i = i;
320+
replInstance.context.numCandidates = 100;
321+
replInstance.context.limit = 5;
322+
323+
replInstance.on('exit', async () => {
324+
console.log('bye bye 👋');
325+
await mongoClient.close();
326+
await queriesFile.close();
327+
server?.close();
328+
replInstance.close();
329+
process.exit();
330+
});

0 commit comments

Comments
 (0)