|
22 | 22 | "source": [ |
23 | 23 | "import os\n", |
24 | 24 | "from pymongo import MongoClient\n", |
| 25 | + "import requests\n", |
25 | 26 | "from utils import track_progress" |
26 | 27 | ] |
27 | 28 | }, |
|
32 | 33 | "outputs": [], |
33 | 34 | "source": [ |
34 | 35 | "# If you are using your own MongoDB Atlas cluster, use the connection string for your cluster here\n", |
35 | | - "MONGODB_URI = os.environ.get(\"MONGODB_URI\")\n", |
| 36 | + "MONGODB_URI = os.getenv(\"MONGODB_URI\")\n", |
36 | 37 | "# Initialize a MongoDB Python client\n", |
37 | 38 | "mongodb_client = MongoClient(MONGODB_URI)\n", |
38 | 39 | "# Check the connection to the server\n", |
|
49 | 50 | "track_progress(\"cluster_creation\", \"ai_vs_lab\")" |
50 | 51 | ] |
51 | 52 | }, |
| 53 | + { |
| 54 | + "cell_type": "markdown", |
| 55 | + "metadata": {}, |
| 56 | + "source": [ |
| 57 | + "Skip the rest of the steps in this section if you are **NOT** at a MongoDB Developer Day. Refer to the [lab documentation](https://mongodb-developer.github.io/vector-search-lab/docs/dev-env/setup-pre-reqs) for information on setting up the additional prerequisites." |
| 58 | + ] |
| 59 | + }, |
| 60 | + { |
| 61 | + "cell_type": "code", |
| 62 | + "execution_count": null, |
| 63 | + "metadata": {}, |
| 64 | + "outputs": [], |
| 65 | + "source": [ |
| 66 | + "# Set the URL for the AI proxy service\n", |
| 67 | + "SERVERLESS_URL = os.getenv(\"SERVERLESS_URL\")" |
| 68 | + ] |
| 69 | + }, |
| 70 | + { |
| 71 | + "cell_type": "code", |
| 72 | + "execution_count": null, |
| 73 | + "metadata": {}, |
| 74 | + "outputs": [], |
| 75 | + "source": [ |
| 76 | + "# Set the passkey provided by your workshop instructor\n", |
| 77 | + "PASSKEY = \"enter-passkey-here\"" |
| 78 | + ] |
| 79 | + }, |
| 80 | + { |
| 81 | + "cell_type": "code", |
| 82 | + "execution_count": null, |
| 83 | + "metadata": {}, |
| 84 | + "outputs": [], |
| 85 | + "source": [ |
| 86 | + "try:\n", |
| 87 | + " # Obtain a Voyage AI API key from our AI proxy service\n", |
| 88 | + " os.environ[\"VOYAGE_API_KEY\"] = requests.post(url=SERVERLESS_URL, json={\"task\": \"get_token\", \"data\": PASSKEY}).json()[\"token\"]\n", |
| 89 | + "except KeyError:\n", |
| 90 | + " # If the passkey has expired, you will need to obtain your own API key and set it in the environment variable `VOYAGE_API_KEY`\n", |
| 91 | + " print(\"Token expired. Obtain your own API key: https://docs.voyageai.com/docs/api-key-and-installation#authentication-with-api-keys\")" |
| 92 | + ] |
| 93 | + }, |
52 | 94 | { |
53 | 95 | "cell_type": "markdown", |
54 | 96 | "metadata": {}, |
|
128 | 170 | "metadata": {}, |
129 | 171 | "outputs": [], |
130 | 172 | "source": [ |
131 | | - "# You may see a warning upon running this cell. You can ignore it.\n", |
132 | | - "from sentence_transformers import SentenceTransformer\n", |
133 | 173 | "from PIL import Image\n", |
134 | | - "import requests" |
| 174 | + "import voyageai" |
135 | 175 | ] |
136 | 176 | }, |
137 | 177 | { |
|
140 | 180 | "metadata": {}, |
141 | 181 | "outputs": [], |
142 | 182 | "source": [ |
143 | | - "# Load a multimodal embedding model using the Sentence Transformers library\n", |
144 | | - "embedding_model = SentenceTransformer(\"clip-ViT-B-32\")" |
| 183 | + "# Initialize the Voyage AI client\n", |
| 184 | + "vo = voyageai.Client()" |
145 | 185 | ] |
146 | 186 | }, |
147 | 187 | { |
|
150 | 190 | "source": [ |
151 | 191 | "### For images\n", |
152 | 192 | "\n", |
153 | | - "📚 https://huggingface.co/sentence-transformers/clip-ViT-B-32#usage" |
| 193 | + "📚 https://docs.voyageai.com/docs/multimodal-embeddings#python-api (See the Example)" |
154 | 194 | ] |
155 | 195 | }, |
156 | 196 | { |
|
162 | 202 | "image_url = \"https://images.isbndb.com/covers/4318463482198.jpg\"\n", |
163 | 203 | "# Load the image from the URL above\n", |
164 | 204 | "image = Image.open(requests.get(image_url, stream=True).raw)\n", |
165 | | - "# Embed the `image` using the `embedding_model` instantiated above and return the embedding as a list\n", |
166 | | - "# An array can be converted to a list using the `tolist()` method\n", |
| 205 | + "# Use the `multimodal_embed` method of the Voyage AI API with the following arguments to embed the image:\n", |
| 206 | + "# inputs: The image wrapped in a list of lists\n", |
| 207 | + "# model: `voyage-multimodal-3`\n", |
| 208 | + "# input_type: \"query\" or \"document\" (either works for this example)\n", |
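| | + "# For reference, a possible call (mirroring the Voyage AI docs linked above) might be:\n", |
| | + "# embedding = vo.multimodal_embed(inputs=[[image]], model=\"voyage-multimodal-3\", input_type=\"document\")\n", |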
167 | 209 | "embedding = <CODE_BLOCK_1>" |
168 | 210 | ] |
169 | 211 | }, |
|
173 | 215 | "metadata": {}, |
174 | 216 | "outputs": [], |
175 | 217 | "source": [ |
176 | | - "print(embedding)" |
| 218 | + "# Get the embeddings as a list from the `embedding` object\n", |
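| | + "# For reference, the returned object exposes an `embeddings` attribute, e.g. `embedding.embeddings[0]`\n", |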
| 219 | + "<CODE_BLOCK_2>" |
177 | 220 | ] |
178 | 221 | }, |
179 | 222 | { |
|
190 | 233 | "outputs": [], |
191 | 234 | "source": [ |
192 | 235 | "text = \"Puppy Preschool: Raising Your Puppy Right---Right from the Start!\"\n", |
193 | | - "# Use the same `embedding_model` to embed a piece of text\n", |
194 | | - "embedding = embedding_model.encode(text).tolist()" |
| 236 | + "# Use the `multimodal_embed` method to embed a piece of text\n", |
| 237 | + "embedding = vo.multimodal_embed(inputs=[[text]], model=\"voyage-multimodal-3\")" |
195 | 238 | ] |
196 | 239 | }, |
197 | 240 | { |
|
200 | 243 | "metadata": {}, |
201 | 244 | "outputs": [], |
202 | 245 | "source": [ |
203 | | - "print(embedding)" |
| 246 | + "# Get the embeddings as a list from the `embedding` object\n", |
| 247 | + "embedding.embeddings[0]" |
204 | 248 | ] |
205 | 249 | }, |
206 | 250 | { |
|
238 | 282 | "metadata": {}, |
239 | 283 | "outputs": [], |
240 | 284 | "source": [ |
241 | | - "def get_embedding(content: str, mode: str) -> List[float]:\n", |
| 285 | + "def get_embedding(content: str, mode: str, input_type: str) -> List[float]:\n", |
242 | 286 | " \"\"\"\n", |
243 | 287 | " Generate embeddings\n", |
244 | 288 | "\n", |
245 | 289 | " Args:\n", |
246 | 290 | " content (str): Content to embed\n", |
247 | 291 | " mode (str): Content mode (Can be one of \"image\" or \"text\")\n", |
| 292 | + " input_type (str): Type of input, either \"document\" or \"query\"\n", |
248 | 293 | "\n", |
249 | 294 | " Returns:\n", |
250 | 295 | " List[float]: Embedding of the content as a list.\n", |
251 | 296 | " \"\"\"\n", |
252 | 297 | " # If an image URL is provided, first load the image\n", |
253 | 298 | " if mode == \"image\":\n", |
254 | 299 | " content = Image.open(requests.get(content, stream=True).raw)\n", |
255 | | - " return embedding_model.encode(content).tolist()" |
| 300 | + " return vo.multimodal_embed(inputs=[[content]], model=\"voyage-multimodal-3\", input_type=input_type).embeddings[0]" |
256 | 301 | ] |
257 | 302 | }, |
258 | 303 | { |
|
268 | 313 | "metadata": {}, |
269 | 314 | "outputs": [], |
270 | 315 | "source": [ |
271 | | - "# Query for all documents in the `collection` collection.\n", |
272 | | - "results = <CODE_BLOCK_2>" |
| 316 | + "# Query for all documents in the `collection` collection\n", |
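| | + "# For reference, an empty query document matches all documents in PyMongo, e.g. `collection.find({})`\n", |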
| 317 | + "results = <CODE_BLOCK_3>" |
273 | 318 | ] |
274 | 319 | }, |
275 | 320 | { |
|
291 | 336 | "for result in tqdm(results):\n", |
292 | 337 | " content = result[field_to_embed]\n", |
293 | 338 | " # Use the `get_embedding` function defined above to embed the `content`\n", |
294 | | - " # Note that `content` contains the cover image URL for the book \n", |
295 | | - " embedding = <CODE_BLOCK_3>\n", |
296 | | - "\n", |
297 | | - " \n", |
| 339 | + " # Note that `content` is the book's cover image URL, so set the `mode` accordingly\n", |
| 340 | + " # `input_type` should be set to \"document\" since we are embedding the \"documents\" we want to search\n", |
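| | + " # For reference, one possible call here: get_embedding(content, \"image\", \"document\")\n", |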
| 341 | + " embedding = <CODE_BLOCK_4>\n", |
298 | 342 | " # Filter for the document where the `_id` field is equal to the `_id` of the current document\n", |
299 | 343 | " filter = {\"_id\": result[\"_id\"]}\n", |
300 | 344 | " # Set the `embedding_field` field to the value `embedding` using the `$set` operator\n", |
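| | + " # For reference, assuming `embedding_field` holds the target field name, a $set update has the shape {\"$set\": {embedding_field: embedding}}\n", |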
301 | | - " update = <CODE_BLOCK_4>\n", |
| 345 | + " update = <CODE_BLOCK_5>\n", |
302 | 346 | " # Update the document in the `collection` collection in place using the `update_one()` operation\n", |
303 | 347 | " # Get the right document `_id` using the `filter` and apply the `update`\n", |
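| | + " # For reference, PyMongo's signature is collection.update_one(filter, update)\n", |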
304 | | - " <CODE_BLOCK_5>" |
| 348 | + " <CODE_BLOCK_6>" |
305 | 349 | ] |
306 | 350 | }, |
307 | 351 | { |
|
338 | 382 | " {\n", |
339 | 383 | " \"type\": \"vector\",\n", |
340 | 384 | " \"path\": \"embedding\",\n", |
341 | | - " \"numDimensions\": 512,\n", |
| 385 | + " \"numDimensions\": 1024,\n", |
342 | 386 | " \"similarity\": \"cosine\",\n", |
343 | 387 | " }\n", |
344 | 388 | " ]\n", |
|
360 | 404 | "outputs": [], |
361 | 405 | "source": [ |
362 | 406 | "# Use the `create_index` function from the `utils` module to create a vector search index with the above definition for the `collection` collection\n", |
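| | + "# For reference, the expected call shape: create_index(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME, model)\n", |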
363 | | - "<CODE_BLOCK_6>" |
| 407 | + "<CODE_BLOCK_7>" |
364 | 408 | ] |
365 | 409 | }, |
366 | 410 | { |
|
411 | 455 | " filter (Optional[Dict], optional): Optional vector search pre-filter\n", |
412 | 456 | " \"\"\"\n", |
413 | 457 | " # Generate embedding for the `user_query` using the `get_embedding` function defined in Step 4\n", |
414 | | - " query_embedding = <CODE_BLOCK_7>\n", |
| 458 | + " # `input_type` should be set to \"query\" since we are embedding the query\n", |
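| | + " # For reference (parameter name assumed): get_embedding(user_query, mode, \"query\")\n", |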
| 459 | + " query_embedding = <CODE_BLOCK_8>\n", |
415 | 460 | "\n", |
416 | 461 | " # Define an aggregation pipeline consisting of a $vectorSearch stage, followed by a $project stage\n", |
417 | 462 | " # Set the number of candidates to 50 and only return the top 5 documents from the vector search\n", |
418 | 463 | " # Set the `filter` field in the $vectorSearch stage to the value `filter` passed to the function\n", |
419 | 464 | " # In the $project stage, exclude the `_id` field, include these fields: `title`, `cover`, `year`, `pages`, and the `vectorSearchScore`\n", |
420 | 465 | " # NOTE: Use variables defined previously for the `index`, `queryVector` and `path` fields in the $vectorSearch stage\n", |
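| | + " # For reference, a sketch of the expected shape (variable names assumed from earlier steps):\n", |
| | + " # pipeline = [\n", |
| | + " #   {\"$vectorSearch\": {\"index\": ATLAS_VECTOR_SEARCH_INDEX_NAME, \"path\": \"embedding\",\n", |
| | + " #    \"queryVector\": query_embedding, \"numCandidates\": 50, \"limit\": 5, \"filter\": filter}},\n", |
| | + " #   {\"$project\": {\"_id\": 0, \"title\": 1, \"cover\": 1, \"year\": 1, \"pages\": 1,\n", |
| | + " #    \"score\": {\"$meta\": \"vectorSearchScore\"}}},\n", |
| | + " # ]\n", |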
421 | | - " pipeline = <CODE_BLOCK_8>\n", |
| 466 | + " pipeline = <CODE_BLOCK_9>\n", |
422 | 467 | "\n", |
423 | 468 | " # Execute the aggregation `pipeline` and store the results in `results`\n", |
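| | + " # For reference: collection.aggregate(pipeline)\n", |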
424 | | - " results = <CODE_BLOCK_9>\n", |
| 469 | + " results = <CODE_BLOCK_10>\n", |
425 | 470 | "\n", |
426 | 471 | " # Print book title, score, and cover image\n", |
427 | 472 | " for book in results:\n", |
|
487 | 532 | "outputs": [], |
488 | 533 | "source": [ |
489 | 534 | "# Modify the vector search index `model` from Step 5 to include the `year` field as a `filter` field\n", |
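| | + "# For reference, a sketch (assuming the Step 5 `fields` array): add a filter field such as\n", |
| | + "# {\"type\": \"filter\", \"path\": \"year\"} alongside the existing vector field\n", |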
490 | | - "model = <CODE_BLOCK_10>" |
| 535 | + "model = <CODE_BLOCK_11>" |
491 | 536 | ] |
492 | 537 | }, |
493 | 538 | { |
|
524 | 569 | "outputs": [], |
525 | 570 | "source": [ |
526 | 571 | "# Create a filter definition to filter for books where the `year` field is greater than or equal to `2002` using the `$gte` operator\n", |
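| | + "# For reference, the expected shape: {\"year\": {\"$gte\": 2002}}\n", |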
527 | | - "filter = <CODE_BLOCK_11>\n", |
| 572 | + "filter = <CODE_BLOCK_12>\n", |
528 | 573 | "# Pass the `filter` as an argument to the `vector_search` function.\n", |
529 | 574 | "# Notice how this filter is incorporated in the `pipeline` in the `vector_search` function.\n", |
530 | 575 | "vector_search(\"A peaceful and uplifting atmosphere\", \"text\", filter)" |
|
546 | 591 | "outputs": [], |
547 | 592 | "source": [ |
548 | 593 | "# Modify the vector search index `model` from Step 5 to include `year` and `pages` as filter fields\n", |
549 | | - "model = <CODE_BLOCK_12>" |
| 594 | + "model = <CODE_BLOCK_13>" |
550 | 595 | ] |
551 | 596 | }, |
552 | 597 | { |
|
584 | 629 | "source": [ |
585 | 630 | "# Create a filter definition to filter for books where the `year` field is greater than or equal to `2002` and the `pages` field is less than or equal to `250`\n", |
586 | 631 | "# Use the `$gte` and `$lte` operators\n", |
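| | + "# For reference, the expected shape: {\"year\": {\"$gte\": 2002}, \"pages\": {\"$lte\": 250}}\n", |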
587 | | - "filter = <CODE_BLOCK_13>\n", |
| 632 | + "filter = <CODE_BLOCK_14>\n", |
588 | 633 | "# Pass the `filter` as an argument to the `vector_search` function.\n", |
589 | 634 | "# Notice how this filter is incorporated in the `pipeline` in the `vector_search` function.\n", |
590 | 635 | "vector_search(\"A peaceful and uplifting atmosphere\", \"text\", filter)" |
|
594 | 639 | "cell_type": "markdown", |
595 | 640 | "metadata": {}, |
596 | 641 | "source": [ |
597 | | - "# Step 8: Changing the similarity metric" |
598 | | - ] |
599 | | - }, |
600 | | - { |
601 | | - "cell_type": "markdown", |
602 | | - "metadata": {}, |
603 | | - "source": [ |
604 | | - "📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#syntax" |
605 | | - ] |
606 | | - }, |
607 | | - { |
608 | | - "cell_type": "code", |
609 | | - "execution_count": null, |
610 | | - "metadata": {}, |
611 | | - "outputs": [], |
612 | | - "source": [ |
613 | | - "# Modify the vector search index `model` from Step 5 to change the similarity metric to `dotProduct`\n", |
614 | | - "model = <CODE_BLOCK_14>" |
615 | | - ] |
616 | | - }, |
617 | | - { |
618 | | - "cell_type": "code", |
619 | | - "execution_count": null, |
620 | | - "metadata": {}, |
621 | | - "outputs": [], |
622 | | - "source": [ |
623 | | - "# Use the `create_index` function from the `utils` module to re-create the vector search index with the modified model\n", |
624 | | - "create_index(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME, model)" |
625 | | - ] |
626 | | - }, |
627 | | - { |
628 | | - "cell_type": "code", |
629 | | - "execution_count": null, |
630 | | - "metadata": {}, |
631 | | - "outputs": [], |
632 | | - "source": [ |
633 | | - "# Use the `check_index_ready` function from the `utils` module to verify that the index definition has the correct similarity metric and is in READY status before proceeding\n", |
634 | | - "check_index_ready(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME)" |
635 | | - ] |
636 | | - }, |
637 | | - { |
638 | | - "cell_type": "code", |
639 | | - "execution_count": null, |
640 | | - "metadata": {}, |
641 | | - "outputs": [], |
642 | | - "source": [ |
643 | | - "# Perform a vector search\n", |
644 | | - "# Note any differences in the results due to the different similarity metric\n", |
645 | | - "vector_search(\"A peaceful and uplifting atmosphere\", \"text\")" |
646 | | - ] |
647 | | - }, |
648 | | - { |
649 | | - "cell_type": "markdown", |
650 | | - "metadata": {}, |
651 | | - "source": [ |
652 | | - "# 🦹♀️ Enable vector quantization\n", |
| 642 | + "# Step 8: Enable vector quantization\n", |
653 | 643 | "\n", |
654 | 644 | "📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#syntax" |
655 | 645 | ] |
|
742 | 732 | " {\n", |
743 | 733 | " \"type\": \"vector\",\n", |
744 | 734 | " \"path\": \"embedding\",\n", |
745 | | - " \"numDimensions\": 512,\n", |
| 735 | + " \"numDimensions\": 1024,\n", |
746 | 736 | " \"similarity\": \"cosine\",\n", |
747 | 737 | " }\n", |
748 | 738 | " ]\n", |
|
794 | 784 | " \"$vectorSearch\": {\n", |
795 | 785 | " \"index\": ATLAS_VECTOR_SEARCH_INDEX_NAME,\n", |
796 | 786 | " \"path\": \"embedding\",\n", |
797 | | - " \"queryVector\": get_embedding(user_query, \"text\"),\n", |
| 787 | + " \"queryVector\": get_embedding(user_query, \"text\", \"query\"),\n", |
798 | 788 | " \"numCandidates\": 50,\n", |
799 | 789 | " \"limit\": 10,\n", |
800 | 790 | " }\n", |
|