Skip to content

Commit 28c92d6

Browse files
author
ajosh0504
committed
Adding Voyage AI to VS notebook
1 parent fcbf420 commit 28c92d6

File tree

4 files changed

+1101
-93
lines changed

4 files changed

+1101
-93
lines changed

notebooks/utils/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
SLEEP_TIMER = 5
99
SERVERLESS_URL = os.getenv("SERVERLESS_URL")
10-
CODESPACE_NAME = os.getenv("CODESPACE_NAME")
10+
SANDBOX_NAME = os.getenv("CODESPACE_NAME") or os.getenv("_SANDBOX_ID")
1111

1212

1313
def create_index(collection: Collection, index_name: str, model: Dict) -> None:
@@ -82,5 +82,5 @@ def track_progress(task: str, workshop_id: str) -> None:
8282
workshop (str): Workshop name
8383
"""
8484
print(f"Tracking progress for task {task}")
85-
payload = {"task": task, "workshop_id": workshop_id, "sandbox_id": CODESPACE_NAME}
85+
payload = {"task": task, "workshop_id": workshop_id, "sandbox_id": SANDBOX_NAME}
8686
requests.post(url=SERVERLESS_URL, json={"task": "track_progress", "data": payload})

notebooks/vector-search-lab.ipynb

Lines changed: 80 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"source": [
2323
"import os\n",
2424
"from pymongo import MongoClient\n",
25+
"import requests\n",
2526
"from utils import track_progress"
2627
]
2728
},
@@ -32,7 +33,7 @@
3233
"outputs": [],
3334
"source": [
3435
"# If you are using your own MongoDB Atlas cluster, use the connection string for your cluster here\n",
35-
"MONGODB_URI = os.environ.get(\"MONGODB_URI\")\n",
36+
"MONGODB_URI = os.getenv(\"MONGODB_URI\")\n",
3637
"# Initialize a MongoDB Python client\n",
3738
"mongodb_client = MongoClient(MONGODB_URI)\n",
3839
"# Check the connection to the server\n",
@@ -49,6 +50,47 @@
4950
"track_progress(\"cluster_creation\", \"ai_vs_lab\")"
5051
]
5152
},
53+
{
54+
"cell_type": "markdown",
55+
"metadata": {},
56+
"source": [
57+
"Skip the rest of the steps in this section if you are **NOT** at a MongoDB Developer Day. Refer to the [lab documentation](https://mongodb-developer.github.io/vector-search-lab/docs/dev-env/setup-pre-reqs) for information on setting up additional prerequisites."
58+
]
59+
},
60+
{
61+
"cell_type": "code",
62+
"execution_count": null,
63+
"metadata": {},
64+
"outputs": [],
65+
"source": [
66+
"# Set the URL for the AI proxy service\n",
67+
"SERVERLESS_URL = os.getenv(\"SERVERLESS_URL\")"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": null,
73+
"metadata": {},
74+
"outputs": [],
75+
"source": [
76+
"# Set the passkey provided by your workshop instructor\n",
77+
"PASSKEY = \"enter-passkey-here\""
78+
]
79+
},
80+
{
81+
"cell_type": "code",
82+
"execution_count": null,
83+
"metadata": {},
84+
"outputs": [],
85+
"source": [
86+
"try:\n",
87+
" # Obtain a Voyage AI API key from our AI proxy service\n",
88+
" os.environ[\"VOYAGE_API_KEY\"] = requests.post(url=SERVERLESS_URL, json={\"task\": \"get_token\", \"data\": PASSKEY}).json()[\"token\"]\n",
89+
"except KeyError:\n",
90+
" # If the passkey has expired, you will need to obtain your own API key and set it in the environment variable `VOYAGE_API_KEY`\n",
91+
" print(\"Token expired. Obtain your own API key: https://docs.voyageai.com/docs/api-key-and-installation#authentication-with-api-keys\")"
92+
]
93+
},
5294
{
5395
"cell_type": "markdown",
5496
"metadata": {},
@@ -128,10 +170,8 @@
128170
"metadata": {},
129171
"outputs": [],
130172
"source": [
131-
"# You may see a warning upon running this cell. You can ignore it.\n",
132-
"from sentence_transformers import SentenceTransformer\n",
133173
"from PIL import Image\n",
134-
"import requests"
174+
"import voyageai"
135175
]
136176
},
137177
{
@@ -140,8 +180,8 @@
140180
"metadata": {},
141181
"outputs": [],
142182
"source": [
143-
"# Load a multimodal embedding model using the Sentence Transformers library\n",
144-
"embedding_model = SentenceTransformer(\"clip-ViT-B-32\")"
183+
"# Initialize the Voyage AI client\n",
184+
"vo = voyageai.Client()"
145185
]
146186
},
147187
{
@@ -150,7 +190,7 @@
150190
"source": [
151191
"### For images\n",
152192
"\n",
153-
"📚 https://huggingface.co/sentence-transformers/clip-ViT-B-32#usage"
193+
"📚 https://docs.voyageai.com/docs/multimodal-embeddings#python-api (See the Example)"
154194
]
155195
},
156196
{
@@ -162,8 +202,10 @@
162202
"image_url = \"https://images.isbndb.com/covers/4318463482198.jpg\"\n",
163203
"# Load the image from the URL above\n",
164204
"image = Image.open(requests.get(image_url, stream=True).raw)\n",
165-
"# Embed the `image` using the `embedding_model` instantiated above and return the embedding as a list\n",
166-
"# An array can be converted to a list using the `tolist()` method\n",
205+
"# Use the `multimodal_embed` method of the Voyage AI API with the following arguments to embed the image:\n",
206+
"# inputs: The image wrapped in a list of lists\n",
207+
"# model: `voyage-multimodal-3`\n",
208+
"# input_type: \"query\" or \"document\". Doesn't matter for this example\n",
167209
"embedding = <CODE_BLOCK_1>"
168210
]
169211
},
@@ -173,7 +215,8 @@
173215
"metadata": {},
174216
"outputs": [],
175217
"source": [
176-
"print(embedding)"
218+
"# Get the embeddings as a list from the `embedding` object\n",
219+
"<CODE_BLOCK_2>"
177220
]
178221
},
179222
{
@@ -190,8 +233,8 @@
190233
"outputs": [],
191234
"source": [
192235
"text = \"Puppy Preschool: Raising Your Puppy Right---Right from the Start!\"\n",
193-
"# Use the same `embedding_model` to embed a piece of text\n",
194-
"embedding = embedding_model.encode(text).tolist()"
236+
"# Use the `multimodal_embed` method to embed a piece of text\n",
237+
"embedding = vo.multimodal_embed(inputs=[[text]], model=\"voyage-multimodal-3\")"
195238
]
196239
},
197240
{
@@ -200,7 +243,8 @@
200243
"metadata": {},
201244
"outputs": [],
202245
"source": [
203-
"print(embedding)"
246+
"# Get the embeddings as a list from the `embedding` object\n",
247+
"embedding.embeddings[0]"
204248
]
205249
},
206250
{
@@ -238,21 +282,22 @@
238282
"metadata": {},
239283
"outputs": [],
240284
"source": [
241-
"def get_embedding(content: str, mode: str) -> List[float]:\n",
285+
"def get_embedding(content: str, mode: str, input_type: str) -> List[float]:\n",
242286
" \"\"\"\n",
243287
" Generate embeddings\n",
244288
"\n",
245289
" Args:\n",
246290
" content (str): Content to embed\n",
247291
" mode (str): Content mode (Can be one of \"image\" or \"text\")\n",
292+
" input_type (str): Type of input, either \"document\" or \"query\"\n",
248293
"\n",
249294
" Returns:\n",
250295
" List[float]: Embedding of the content as a list.\n",
251296
" \"\"\"\n",
252297
" # If an image URL is provided, first load the image\n",
253298
" if mode == \"image\":\n",
254299
" content = Image.open(requests.get(content, stream=True).raw)\n",
255-
" return embedding_model.encode(content).tolist()"
300+
" return vo.multimodal_embed(inputs=[[content]], model=\"voyage-multimodal-3\", input_type=input_type).embeddings[0]"
256301
]
257302
},
258303
{
@@ -268,8 +313,8 @@
268313
"metadata": {},
269314
"outputs": [],
270315
"source": [
271-
"# Query for all documents in the `collection` collection.\n",
272-
"results = <CODE_BLOCK_2>"
316+
"# Query for all documents in the `collection` collection\n",
317+
"results = <CODE_BLOCK_3>"
273318
]
274319
},
275320
{
@@ -291,17 +336,16 @@
291336
"for result in tqdm(results):\n",
292337
" content = result[field_to_embed]\n",
293338
" # Use the `get_embedding` function defined above to embed the `content`\n",
294-
" # Note that `content` contains the cover image URL for the book \n",
295-
" embedding = <CODE_BLOCK_3>\n",
296-
"\n",
297-
" \n",
339+
"    # Note that `content` is going to be the cover image URL of the book, so set the `mode` accordingly\n",
340+
" # `input_type` should be set to \"document\" since we are embedding the \"documents\" we want to search\n",
341+
" embedding = <CODE_BLOCK_4>\n",
298342
" # Filter for the document where the `_id` field is equal to the `_id` of the current document\n",
299343
" filter = {\"_id\": result[\"_id\"]}\n",
300344
" # Set the `embedding_field` field to the value `embedding` using the `$set` operator\n",
301-
" update = <CODE_BLOCK_4>\n",
345+
" update = <CODE_BLOCK_5>\n",
302346
" # Update the documents in the `collection` collection inplace using the `update_one()` operation\n",
303347
" # Get the right document `_id` using the `filter` and apply the `update`\n",
304-
" <CODE_BLOCK_5>"
348+
" <CODE_BLOCK_6>"
305349
]
306350
},
307351
{
@@ -338,7 +382,7 @@
338382
" {\n",
339383
" \"type\": \"vector\",\n",
340384
" \"path\": \"embedding\",\n",
341-
" \"numDimensions\": 512,\n",
385+
" \"numDimensions\": 1024,\n",
342386
" \"similarity\": \"cosine\",\n",
343387
" }\n",
344388
" ]\n",
@@ -360,7 +404,7 @@
360404
"outputs": [],
361405
"source": [
362406
"# Use the `create_index` function from the `utils` module to create a vector search index with the above definition for the `collection` collection\n",
363-
"<CODE_BLOCK_6>"
407+
"<CODE_BLOCK_7>"
364408
]
365409
},
366410
{
@@ -411,17 +455,18 @@
411455
" filter (Optional[Dict], optional): Optional vector search pre-filter\n",
412456
" \"\"\"\n",
413457
" # Generate embedding for the `user_query` using the `get_embedding` function defined in Step 4\n",
414-
" query_embedding = <CODE_BLOCK_7>\n",
458+
" # `input_type` should be set to \"query\" since we are embedding the query\n",
459+
" query_embedding = <CODE_BLOCK_8>\n",
415460
"\n",
416461
" # Define an aggregation pipeline consisting of a $vectorSearch stage, followed by a $project stage\n",
417462
" # Set the number of candidates to 50 and only return the top 5 documents from the vector search\n",
418463
" # Set the `filter` field in the $vectorSearch stage to the value `filter` passed to the function\n",
419464
" # In the $project stage, exclude the `_id` field, include these fields: `title`, `cover`, `year`, `pages`, and the `vectorSearchScore`\n",
420465
" # NOTE: Use variables defined previously for the `index`, `queryVector` and `path` fields in the $vectorSearch stage\n",
421-
" pipeline = <CODE_BLOCK_8>\n",
466+
" pipeline = <CODE_BLOCK_9>\n",
422467
"\n",
423468
" # Execute the aggregation `pipeline` and store the results in `results`\n",
424-
" results = <CODE_BLOCK_9>\n",
469+
" results = <CODE_BLOCK_10>\n",
425470
"\n",
426471
" # Print book title, score, and cover image\n",
427472
" for book in results:\n",
@@ -487,7 +532,7 @@
487532
"outputs": [],
488533
"source": [
489534
"# Modify the vector search index `model` from Step 5 to include the `year` field as a `filter` field\n",
490-
"model = <CODE_BLOCK_10>"
535+
"model = <CODE_BLOCK_11>"
491536
]
492537
},
493538
{
@@ -524,7 +569,7 @@
524569
"outputs": [],
525570
"source": [
526571
"# Create a filter definition to filter for books where the `year` field is greater than or equal to `2002` using the `$gte` operator\n",
527-
"filter = <CODE_BLOCK_11>\n",
572+
"filter = <CODE_BLOCK_12>\n",
528573
"# Pass the `filter` as an argument to the `vector_search` function.\n",
529574
"# Notice how this filter is incorporated in the `pipeline` in the `vector_search` function.\n",
530575
"vector_search(\"A peaceful and uplifting atmosphere\", \"text\", filter)"
@@ -546,7 +591,7 @@
546591
"outputs": [],
547592
"source": [
548593
"# Modify the vector search index `model` from Step 5 to include `year` and `pages` as filter fields\n",
549-
"model = <CODE_BLOCK_12>"
594+
"model = <CODE_BLOCK_13>"
550595
]
551596
},
552597
{
@@ -584,7 +629,7 @@
584629
"source": [
585630
"# Create a filter definition to filter for books where the `year` field is greater than or equal to `2002` and the `pages` field is less than or equal to 250\n",
586631
"# Use the `$gte` and `$lte` operators\n",
587-
"filter = <CODE_BLOCK_13>\n",
632+
"filter = <CODE_BLOCK_14>\n",
588633
"# Pass the `filter` as an argument to the `vector_search` function.\n",
589634
"# Notice how this filter is incorporated in the `pipeline` in the `vector_search` function.\n",
590635
"vector_search(\"A peaceful and uplifting atmosphere\", \"text\", filter)"
@@ -594,62 +639,7 @@
594639
"cell_type": "markdown",
595640
"metadata": {},
596641
"source": [
597-
"# Step 8: Changing the similarity metric"
598-
]
599-
},
600-
{
601-
"cell_type": "markdown",
602-
"metadata": {},
603-
"source": [
604-
"📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#syntax"
605-
]
606-
},
607-
{
608-
"cell_type": "code",
609-
"execution_count": null,
610-
"metadata": {},
611-
"outputs": [],
612-
"source": [
613-
"# Modify the vector search index `model` from Step 5 to change the similarity metric to `dotProduct`\n",
614-
"model = <CODE_BLOCK_14>"
615-
]
616-
},
617-
{
618-
"cell_type": "code",
619-
"execution_count": null,
620-
"metadata": {},
621-
"outputs": [],
622-
"source": [
623-
"# Use the `create_index` function from the `utils` module to re-create the vector search index with the modified model\n",
624-
"create_index(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME, model)"
625-
]
626-
},
627-
{
628-
"cell_type": "code",
629-
"execution_count": null,
630-
"metadata": {},
631-
"outputs": [],
632-
"source": [
633-
"# Use the `check_index_ready` function from the `utils` module to verify that the index definition has the correct similarity metric and is in READY status before proceeding\n",
634-
"check_index_ready(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME)"
635-
]
636-
},
637-
{
638-
"cell_type": "code",
639-
"execution_count": null,
640-
"metadata": {},
641-
"outputs": [],
642-
"source": [
643-
"# Perform a vector search\n",
644-
"# Note any differences in the results due to the different similarity metric\n",
645-
"vector_search(\"A peaceful and uplifting atmosphere\", \"text\")"
646-
]
647-
},
648-
{
649-
"cell_type": "markdown",
650-
"metadata": {},
651-
"source": [
652-
"# 🦹‍♀️ Enable vector quantization\n",
642+
"# Step 8: Enable vector quantization\n",
653643
"\n",
654644
"📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#syntax"
655645
]
@@ -742,7 +732,7 @@
742732
" {\n",
743733
" \"type\": \"vector\",\n",
744734
" \"path\": \"embedding\",\n",
745-
" \"numDimensions\": 512,\n",
735+
" \"numDimensions\": 1024,\n",
746736
" \"similarity\": \"cosine\",\n",
747737
" }\n",
748738
" ]\n",
@@ -794,7 +784,7 @@
794784
" \"$vectorSearch\": {\n",
795785
" \"index\": ATLAS_VECTOR_SEARCH_INDEX_NAME,\n",
796786
" \"path\": \"embedding\",\n",
797-
" \"queryVector\": get_embedding(user_query, \"text\"),\n",
787+
" \"queryVector\": get_embedding(user_query, \"text\", \"query\"),\n",
798788
" \"numCandidates\": 50,\n",
799789
" \"limit\": 10,\n",
800790
" }\n",

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,6 @@ langchain-openai==0.3.16
66
langgraph==0.4.2
77
langgraph-checkpoint-mongodb==0.1.3
88
tiktoken==0.9.0
9-
sentence_transformers==4.1.0
9+
voyageai==0.3.4
1010
tqdm==4.67.1
1111
Pillow==11.1.0

0 commit comments

Comments
 (0)