|
22 | 22 | "source": [ |
23 | 23 | "import os\n", |
24 | 24 | "from pymongo import MongoClient\n", |
| 25 | + "import requests\n", |
25 | 26 | "from utils import track_progress" |
26 | 27 | ] |
27 | 28 | }, |
|
32 | 33 | "outputs": [], |
33 | 34 | "source": [ |
34 | 35 | "# If you are using your own MongoDB Atlas cluster, use the connection string for your cluster here\n", |
35 | | - "MONGODB_URI = os.environ.get(\"MONGODB_URI\")\n", |
| 36 | + "MONGODB_URI = os.getenv(\"MONGODB_URI\")\n", |
36 | 37 | "# Initialize a MongoDB Python client\n", |
37 | 38 | "mongodb_client = MongoClient(MONGODB_URI)\n", |
38 | 39 | "# Check the connection to the server\n", |
|
49 | 50 | "track_progress(\"cluster_creation\", \"ai_vs_lab\")" |
50 | 51 | ] |
51 | 52 | }, |
| 53 | + { |
| 54 | + "cell_type": "markdown", |
| 55 | + "metadata": {}, |
| 56 | + "source": [ |
| 57 | + "Skip the rest of the steps in this section if you are **NOT** at a MongoDB Developer Day. Refer to the [lab documentation](https://mongodb-developer.github.io/vector-search-lab/docs/dev-env/setup-pre-reqs) for information on setting up the additional prerequisites." |
| 58 | + ] |
| 59 | + }, |
| 60 | + { |
| 61 | + "cell_type": "code", |
| 62 | + "execution_count": null, |
| 63 | + "metadata": {}, |
| 64 | + "outputs": [], |
| 65 | + "source": [ |
| 66 | + "# Set the URL for the AI proxy service\n", |
| 67 | + "SERVERLESS_URL = os.getenv(\"SERVERLESS_URL\")" |
| 68 | + ] |
| 69 | + }, |
| 70 | + { |
| 71 | + "cell_type": "code", |
| 72 | + "execution_count": null, |
| 73 | + "metadata": {}, |
| 74 | + "outputs": [], |
| 75 | + "source": [ |
| 76 | + "# Set the passkey provided by your workshop instructor\n", |
| 77 | + "PASSKEY = \"enter-passkey-here\"" |
| 78 | + ] |
| 79 | + }, |
| 80 | + { |
| 81 | + "cell_type": "code", |
| 82 | + "execution_count": null, |
| 83 | + "metadata": {}, |
| 84 | + "outputs": [], |
| 85 | + "source": [ |
| 86 | + "try:\n", |
| 87 | + " # Obtain a Voyage AI API key from our AI proxy service\n", |
| 88 | + " os.environ[\"VOYAGE_API_KEY\"] = requests.post(url=SERVERLESS_URL, json={\"task\": \"get_token\", \"data\": PASSKEY}).json()[\"token\"]\n", |
| 89 | + "except KeyError:\n", |
| 90 | + " # If the passkey has expired, you will need to obtain your own API key and set it in the environment variable `VOYAGE_API_KEY`\n", |
| 91 | + " print(\"Token expired. Obtain your own API key: https://docs.voyageai.com/docs/api-key-and-installation#authentication-with-api-keys\")" |
| 92 | + ] |
| 93 | + }, |
52 | 94 | { |
53 | 95 | "cell_type": "markdown", |
54 | 96 | "metadata": {}, |
|
128 | 170 | "metadata": {}, |
129 | 171 | "outputs": [], |
130 | 172 | "source": [ |
131 | | - "# You may see a warning upon running this cell. You can ignore it.\n", |
132 | | - "from sentence_transformers import SentenceTransformer\n", |
133 | 173 | "from PIL import Image\n", |
134 | | - "import requests" |
| 174 | + "import voyageai" |
135 | 175 | ] |
136 | 176 | }, |
137 | 177 | { |
|
140 | 180 | "metadata": {}, |
141 | 181 | "outputs": [], |
142 | 182 | "source": [ |
143 | | - "# Load a multimodal embedding model using the Sentence Transformers library\n", |
144 | | - "embedding_model = SentenceTransformer(\"clip-ViT-B-32\")" |
| 183 | + "# Initialize the Voyage AI client\n", |
| 184 | + "vo = voyageai.Client()" |
145 | 185 | ] |
146 | 186 | }, |
147 | 187 | { |
|
150 | 190 | "source": [ |
151 | 191 | "### For images\n", |
152 | 192 | "\n", |
153 | | - "📚 https://huggingface.co/sentence-transformers/clip-ViT-B-32#usage" |
| 193 | + "📚 https://docs.voyageai.com/docs/multimodal-embeddings#python-api (See the Example)" |
154 | 194 | ] |
155 | 195 | }, |
156 | 196 | { |
|
162 | 202 | "image_url = \"https://images.isbndb.com/covers/4318463482198.jpg\"\n", |
163 | 203 | "# Load the image from the URL above\n", |
164 | 204 | "image = Image.open(requests.get(image_url, stream=True).raw)\n", |
165 | | - "# Embed the `image` using the `embedding_model` instantiated above and return the embedding as a list\n", |
166 | | - "# An array can be converted to a list using the `tolist()` method\n", |
| 205 | + "# Use the `multimodal_embed` method of the Voyage AI API with the following arguments to embed the image:\n", |
| 206 | + "# inputs: The image wrapped in a list of lists\n", |
| 207 | + "# model: `voyage-multimodal-3`\n", |
| 208 | + "# input_type: \"query\" or \"document\" (either works for this example)\n", |
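| | + "# For reference, a possible call (mirroring the Voyage AI docs linked above) might be:\n", |
| | + "# embedding = vo.multimodal_embed(inputs=[[image]], model=\"voyage-multimodal-3\", input_type=\"document\")\n", |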
167 | 209 | "embedding = <CODE_BLOCK_1>" |
168 | 210 | ] |
169 | 211 | }, |
|
173 | 215 | "metadata": {}, |
174 | 216 | "outputs": [], |
175 | 217 | "source": [ |
176 | | - "print(embedding)" |
| 218 | + "# Get the embeddings as a list from the `embedding` object\n", |
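| | + "# For reference, the returned object exposes an `embeddings` attribute, e.g. `embedding.embeddings[0]`\n", |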
| 219 | + "<CODE_BLOCK_2>" |
177 | 220 | ] |
178 | 221 | }, |
179 | 222 | { |
|
190 | 233 | "outputs": [], |
191 | 234 | "source": [ |
192 | 235 | "text = \"Puppy Preschool: Raising Your Puppy Right---Right from the Start!\"\n", |
193 | | - "# Use the same `embedding_model` to embed a piece of text\n", |
194 | | - "embedding = embedding_model.encode(text).tolist()" |
| 236 | + "# Use the `multimodal_embed` method to embed a piece of text\n", |
| 237 | + "embedding = vo.multimodal_embed(inputs=[[text]], model=\"voyage-multimodal-3\")" |
195 | 238 | ] |
196 | 239 | }, |
197 | 240 | { |
|
200 | 243 | "metadata": {}, |
201 | 244 | "outputs": [], |
202 | 245 | "source": [ |
203 | | - "print(embedding)" |
| 246 | + "# Get the embeddings as a list from the `embedding` object\n", |
| 247 | + "embedding.embeddings[0]" |
204 | 248 | ] |
205 | 249 | }, |
206 | 250 | { |
|
238 | 282 | "metadata": {}, |
239 | 283 | "outputs": [], |
240 | 284 | "source": [ |
241 | | - "def get_embedding(content: str, mode: str) -> List[float]:\n", |
| 285 | + "def get_embedding(content: str, mode: str, input_type: str) -> List[float]:\n", |
242 | 286 | " \"\"\"\n", |
243 | 287 | " Generate embeddings\n", |
244 | 288 | "\n", |
245 | 289 | " Args:\n", |
246 | 290 | " content (str): Content to embed\n", |
247 | 291 | " mode (str): Content mode (Can be one of \"image\" or \"text\")\n", |
| 292 | + " input_type (str): Type of input, either \"document\" or \"query\"\n", |
248 | 293 | "\n", |
249 | 294 | " Returns:\n", |
250 | 295 | " List[float]: Embedding of the content as a list.\n", |
251 | 296 | " \"\"\"\n", |
252 | 297 | " # If an image URL is provided, first load the image\n", |
253 | 298 | " if mode == \"image\":\n", |
254 | 299 | " content = Image.open(requests.get(content, stream=True).raw)\n", |
255 | | - " return embedding_model.encode(content).tolist()" |
| 300 | + " return vo.multimodal_embed(inputs=[[content]], model=\"voyage-multimodal-3\", input_type=input_type).embeddings[0]" |
256 | 301 | ] |
257 | 302 | }, |
258 | 303 | { |
|
268 | 313 | "metadata": {}, |
269 | 314 | "outputs": [], |
270 | 315 | "source": [ |
271 | | - "# Query for all documents in the `collection` collection.\n", |
272 | | - "results = <CODE_BLOCK_2>" |
| 316 | + "# Query for all documents in the `collection` collection\n", |
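| | + "# For reference, an empty query document matches all documents in PyMongo, e.g. `collection.find({})`\n", |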
| 317 | + "results = <CODE_BLOCK_3>" |
273 | 318 | ] |
274 | 319 | }, |
275 | 320 | { |
|
291 | 336 | "for result in tqdm(results):\n", |
292 | 337 | " content = result[field_to_embed]\n", |
293 | 338 | " # Use the `get_embedding` function defined above to embed the `content`\n", |
294 | | - " # Note that `content` contains the cover image URL for the book \n", |
295 | | - " embedding = <CODE_BLOCK_3>\n", |
296 | | - "\n", |
297 | | - " \n", |
| 339 | + " # Note that `content` is the book's cover image URL, so set the `mode` accordingly\n", |
| 340 | + " # `input_type` should be set to \"document\" since we are embedding the \"documents\" we want to search\n", |
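| | + " # For reference, one possible call here: get_embedding(content, \"image\", \"document\")\n", |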
| 341 | + " embedding = <CODE_BLOCK_4>\n", |
298 | 342 | " # Filter for the document where the `_id` field is equal to the `_id` of the current document\n", |
299 | 343 | " filter = {\"_id\": result[\"_id\"]}\n", |
300 | 344 | " # Set the `embedding_field` field to the value `embedding` using the `$set` operator\n", |
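| | + " # For reference, assuming `embedding_field` holds the target field name, a $set update has the shape {\"$set\": {embedding_field: embedding}}\n", |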
301 | | - " update = <CODE_BLOCK_4>\n", |
| 345 | + " update = <CODE_BLOCK_5>\n", |
302 | 346 | " # Update the document in the `collection` collection in place using the `update_one()` operation\n", |
303 | 347 | " # Get the right document `_id` using the `filter` and apply the `update`\n", |
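| | + " # For reference, PyMongo's signature is collection.update_one(filter, update)\n", |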
304 | | - " <CODE_BLOCK_5>" |
| 348 | + " <CODE_BLOCK_6>" |
305 | 349 | ] |
306 | 350 | }, |
307 | 351 | { |
|
338 | 382 | " {\n", |
339 | 383 | " \"type\": \"vector\",\n", |
340 | 384 | " \"path\": \"embedding\",\n", |
341 | | - " \"numDimensions\": 512,\n", |
| 385 | + " \"numDimensions\": 1024,\n", |
342 | 386 | " \"similarity\": \"cosine\",\n", |
343 | 387 | " }\n", |
344 | 388 | " ]\n", |
|
360 | 404 | "outputs": [], |
361 | 405 | "source": [ |
362 | 406 | "# Use the `create_index` function from the `utils` module to create a vector search index with the above definition for the `collection` collection\n", |
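| | + "# For reference, the expected call shape: create_index(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME, model)\n", |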
363 | | - "<CODE_BLOCK_6>" |
| 407 | + "<CODE_BLOCK_7>" |
364 | 408 | ] |
365 | 409 | }, |
366 | 410 | { |
|
411 | 455 | " filter (Optional[Dict], optional): Optional vector search pre-filter\n", |
412 | 456 | " \"\"\"\n", |
413 | 457 | " # Generate embedding for the `user_query` using the `get_embedding` function defined in Step 4\n", |
414 | | - " query_embedding = <CODE_BLOCK_7>\n", |
| 458 | + " # `input_type` should be set to \"query\" since we are embedding the query\n", |
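| | + " # For reference (parameter name assumed): get_embedding(user_query, mode, \"query\")\n", |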
| 459 | + " query_embedding = <CODE_BLOCK_8>\n", |
415 | 460 | "\n", |
416 | 461 | " # Define an aggregation pipeline consisting of a $vectorSearch stage, followed by a $project stage\n", |
417 | 462 | " # Set the number of candidates to 50 and only return the top 5 documents from the vector search\n", |
418 | 463 | " # Set the `filter` field in the $vectorSearch stage to the value `filter` passed to the function\n", |
419 | 464 | " # In the $project stage, exclude the `_id` field, include these fields: `title`, `cover`, `year`, `pages`, and the `vectorSearchScore`\n", |
420 | 465 | " # NOTE: Use variables defined previously for the `index`, `queryVector` and `path` fields in the $vectorSearch stage\n", |
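| | + " # For reference, a sketch of the expected shape (variable names assumed from earlier steps):\n", |
| | + " # pipeline = [\n", |
| | + " #   {\"$vectorSearch\": {\"index\": ATLAS_VECTOR_SEARCH_INDEX_NAME, \"path\": \"embedding\",\n", |
| | + " #    \"queryVector\": query_embedding, \"numCandidates\": 50, \"limit\": 5, \"filter\": filter}},\n", |
| | + " #   {\"$project\": {\"_id\": 0, \"title\": 1, \"cover\": 1, \"year\": 1, \"pages\": 1,\n", |
| | + " #    \"score\": {\"$meta\": \"vectorSearchScore\"}}},\n", |
| | + " # ]\n", |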
421 | | - " pipeline = <CODE_BLOCK_8>\n", |
| 466 | + " pipeline = <CODE_BLOCK_9>\n", |
422 | 467 | "\n", |
423 | 468 | " # Execute the aggregation `pipeline` and store the results in `results`\n", |
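| | + " # For reference: collection.aggregate(pipeline)\n", |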
424 | | - " results = <CODE_BLOCK_9>\n", |
| 469 | + " results = <CODE_BLOCK_10>\n", |
425 | 470 | "\n", |
426 | 471 | " # Print book title, score, and cover image\n", |
427 | 472 | " for book in results:\n", |
|
487 | 532 | "outputs": [], |
488 | 533 | "source": [ |
489 | 534 | "# Modify the vector search index `model` from Step 5 to include the `year` field as a `filter` field\n", |
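| | + "# For reference, a sketch (assuming the Step 5 `fields` array): add a filter field such as\n", |
| | + "# {\"type\": \"filter\", \"path\": \"year\"} alongside the existing vector field\n", |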
490 | | - "model = <CODE_BLOCK_10>" |
| 535 | + "model = <CODE_BLOCK_11>" |
491 | 536 | ] |
492 | 537 | }, |
493 | 538 | { |
|
524 | 569 | "outputs": [], |
525 | 570 | "source": [ |
526 | 571 | "# Create a filter definition to filter for books where the `year` field is greater than or equal to `2002` using the `$gte` operator\n", |
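| | + "# For reference, the expected shape: {\"year\": {\"$gte\": 2002}}\n", |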
527 | | - "filter = <CODE_BLOCK_11>\n", |
| 572 | + "filter = <CODE_BLOCK_12>\n", |
528 | 573 | "# Pass the `filter` as an argument to the `vector_search` function.\n", |
529 | 574 | "# Notice how this filter is incorporated in the `pipeline` in the `vector_search` function.\n", |
530 | 575 | "vector_search(\"A peaceful and uplifting atmosphere\", \"text\", filter)" |
|
546 | 591 | "outputs": [], |
547 | 592 | "source": [ |
548 | 593 | "# Modify the vector search index `model` from Step 5 to include `year` and `pages` as filter fields\n", |
549 | | - "model = <CODE_BLOCK_12>" |
| 594 | + "model = <CODE_BLOCK_13>" |
550 | 595 | ] |
551 | 596 | }, |
552 | 597 | { |
|
584 | 629 | "source": [ |
585 | 630 | "# Create a filter definition to filter for books where the `year` field is greater than or equal to `2002` and the `pages` field is less than or equal to `250`\n", |
586 | 631 | "# Use the `$gte` and `$lte` operators\n", |
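| | + "# For reference, the expected shape: {\"year\": {\"$gte\": 2002}, \"pages\": {\"$lte\": 250}}\n", |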
587 | | - "filter = <CODE_BLOCK_13>\n", |
| 632 | + "filter = <CODE_BLOCK_14>\n", |
588 | 633 | "# Pass the `filter` as an argument to the `vector_search` function.\n", |
589 | 634 | "# Notice how this filter is incorporated in the `pipeline` in the `vector_search` function.\n", |
590 | 635 | "vector_search(\"A peaceful and uplifting atmosphere\", \"text\", filter)" |
|
594 | 639 | "cell_type": "markdown", |
595 | 640 | "metadata": {}, |
596 | 641 | "source": [ |
597 | | - "# Step 8: Changing the similarity metric" |
598 | | - ] |
599 | | - }, |
600 | | - { |
601 | | - "cell_type": "markdown", |
602 | | - "metadata": {}, |
603 | | - "source": [ |
604 | | - "📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#syntax" |
605 | | - ] |
606 | | - }, |
607 | | - { |
608 | | - "cell_type": "code", |
609 | | - "execution_count": null, |
610 | | - "metadata": {}, |
611 | | - "outputs": [], |
612 | | - "source": [ |
613 | | - "# Modify the vector search index `model` from Step 5 to change the similarity metric to `dotProduct`\n", |
614 | | - "model = <CODE_BLOCK_14>" |
615 | | - ] |
616 | | - }, |
617 | | - { |
618 | | - "cell_type": "code", |
619 | | - "execution_count": null, |
620 | | - "metadata": {}, |
621 | | - "outputs": [], |
622 | | - "source": [ |
623 | | - "# Use the `create_index` function from the `utils` module to re-create the vector search index with the modified model\n", |
624 | | - "create_index(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME, model)" |
625 | | - ] |
626 | | - }, |
627 | | - { |
628 | | - "cell_type": "code", |
629 | | - "execution_count": null, |
630 | | - "metadata": {}, |
631 | | - "outputs": [], |
632 | | - "source": [ |
633 | | - "# Use the `check_index_ready` function from the `utils` module to verify that the index definition has the correct similarity metric and is in READY status before proceeding\n", |
634 | | - "check_index_ready(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME)" |
635 | | - ] |
636 | | - }, |
637 | | - { |
638 | | - "cell_type": "code", |
639 | | - "execution_count": null, |
640 | | - "metadata": {}, |
641 | | - "outputs": [], |
642 | | - "source": [ |
643 | | - "# Perform a vector search\n", |
644 | | - "# Note any differences in the results due to the different similarity metric\n", |
645 | | - "vector_search(\"A peaceful and uplifting atmosphere\", \"text\")" |
646 | | - ] |
647 | | - }, |
648 | | - { |
649 | | - "cell_type": "markdown", |
650 | | - "metadata": {}, |
651 | | - "source": [ |
652 | | - "# 🦹♀️ Enable vector quantization\n", |
| 642 | + "# Step 8: Enable vector quantization\n", |
653 | 643 | "\n", |
654 | 644 | "📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#syntax" |
655 | 645 | ] |
|
742 | 732 | " {\n", |
743 | 733 | " \"type\": \"vector\",\n", |
744 | 734 | " \"path\": \"embedding\",\n", |
745 | | - " \"numDimensions\": 512,\n", |
| 735 | + " \"numDimensions\": 1024,\n", |
746 | 736 | " \"similarity\": \"cosine\",\n", |
747 | 737 | " }\n", |
748 | 738 | " ]\n", |
|
794 | 784 | " \"$vectorSearch\": {\n", |
795 | 785 | " \"index\": ATLAS_VECTOR_SEARCH_INDEX_NAME,\n", |
796 | 786 | " \"path\": \"embedding\",\n", |
797 | | - " \"queryVector\": get_embedding(user_query, \"text\"),\n", |
| 787 | + " \"queryVector\": get_embedding(user_query, \"text\", \"query\"),\n", |
798 | 788 | " \"numCandidates\": 50,\n", |
799 | 789 | " \"limit\": 10,\n", |
800 | 790 | " }\n", |
|