Added HNSW example in cosmos db for mongodb vcore example

jcodella · jcodella · commit da7e0433ceaa · 2023-11-27T16:02:53.000-05:00
diff --git a/Python/CosmosDB-MongoDB-vCore/CosmosDB-MongoDB-vCore_AzureOpenAI_Tutorial.ipynb b/Python/CosmosDB-MongoDB-vCore/CosmosDB-MongoDB-vCore_AzureOpenAI_Tutorial.ipynb
@@ -80,6 +80,7 @@
     "COSMOS_MONGO_USER = config['cosmos_db_mongo_user']\n",
     "COSMOS_MONGO_PWD = config['cosmos_db_mongo_pwd']\n",
     "COSMOS_MONGO_SERVER = config['cosmos_db_mongo_server']\n",
+    "\n",
     "openai.api_type = config['openai_api_type']\n",
     "openai.api_key = config['openai_api_key']\n",
     "openai.api_base = config['openai_api_endpoint']\n",
@@ -118,7 +119,7 @@
    "metadata": {},
    "source": [
     "# Load data and create embeddings <a class=\"anchor\" id=\"loaddata\"></a>\n",
-    "Here we'll load a sample dataset containing descriptions of Azure services. Then we'll user Azure OpenAI to create vector embeddings from this data."
+    "Here we load a sample dataset containing descriptions of Azure services, then we use Azure OpenAI to create vector embeddings from this data."
    ]
   },
   {
@@ -127,10 +128,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Load text-sample.json data file\n",
+    "# Load text-sample.json data file. Embeddings will need to be generated using the function below.\n",
     "#data_file = open(file=\"../../DataSet/AzureServices/text-sample.json\", mode=\"r\")\n",
     "\n",
-    "# Load this file instead if embeddings were previously created and saved.\n",
+    "# OR Load text-sample_w_embeddings.json which has embeddings pre-computed\n",
     "data_file = open(file=\"../../DataSet/AzureServices/text-sample_w_embeddings.json\", mode=\"r\") \n",
     "data = json.load(data_file)\n",
     "data_file.close()"
@@ -185,7 +186,6 @@
     "    item['contentVector'] = content_embeddings\n",
     "    item['@search.action'] = 'upload'\n",
     "    print(\"Creating embeddings for item:\", n, \"/\" ,len(data), end='\\r')\n",
-    "\n",
     "# Save embeddings to sample_text_w_embeddings.json file\n",
     "with open(\"../../DataSet/AzureServices/text-sample_w_embeddings.json\", \"w\") as f:\n",
     "    json.dump(data, f)"
@@ -215,17 +215,6 @@
     "mongo_client = pymongo.MongoClient(mongo_conn)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## Use only if re-reunning code and want to reset db and collection\n",
-    "collection.drop_index(\"vectorSearchIndex\")\n",
-    "mongo_client.drop_database(\"TutorialDB\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -240,10 +229,10 @@
    "outputs": [],
    "source": [
     "# create a database called TutorialDB\n",
-    "db = mongo_client['TutorialDB']\n",
+    "db = mongo_client['ExampleDB']\n",
     "\n",
     "# Create collection if it doesn't exist\n",
-    "COLLECTION_NAME = \"TutorialCol\"\n",
+    "COLLECTION_NAME = \"ExampleCollection\"\n",
     "\n",
     "collection = db[COLLECTION_NAME]\n",
     "\n",
@@ -255,11 +244,32 @@
     "    print(\"Using collection: '{}'.\\n\".format(COLLECTION_NAME))"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Use only if re-running the code and you want to reset the db and collection\n",
+    "collection.drop_index(\"VectorSearchIndex\")\n",
+    "mongo_client.drop_database(\"ExampleDB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create the vector index\n",
+    "\n",
+    "**IMPORTANT: You can only create one index per vector property.** That is, you cannot create more than one index that points to the same vector property. If you want to change the index type (e.g., from IVF to HNSW) you must drop the index first before creating a new index."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Create the vector index"
+    "### IVF\n",
+    "IVF is the default vector indexing algorithm, which works on all cluster tiers. It's an approximate nearest neighbors (ANN) approach that uses clustering to speed up the search for similar vectors in a dataset. "
    ]
   },
   {
@@ -269,10 +279,10 @@
    "outputs": [],
    "source": [
     "db.command({\n",
-    "  'createIndexes': 'TutorialCol',\n",
+    "  'createIndexes': 'ExampleCollection',\n",
     "  'indexes': [\n",
     "    {\n",
-    "      'name': 'vectorSearchIndex',\n",
+    "      'name': 'VectorSearchIndex',\n",
     "      'key': {\n",
     "        \"contentVector\": \"cosmosSearch\"\n",
     "      },\n",
@@ -284,7 +294,46 @@
     "      }\n",
     "    }\n",
     "  ]\n",
-    "});"
+    "})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### HNSW (preview)\n",
+    "\n",
+    "HNSW stands for Hierarchical Navigable Small World, a graph-based data structure that partitions vectors into clusters and subclusters. With HNSW, you can perform approximate nearest neighbor (ANN) search at higher speeds with greater accuracy. As a preview feature, HNSW must be enabled using Azure Feature Enablement Control (AFEC) by selecting the \"mongoHnswIndex\" feature. For more information, see [enable preview features](https://learn.microsoft.com/azure/azure-resource-manager/management/preview-features).\n",
+    "\n",
+    "HNSW works on M50 cluster tiers and higher while in preview."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db.command(\n",
+    "{ \n",
+    "    \"createIndexes\": \"ExampleCollection\",\n",
+    "    \"indexes\": [\n",
+    "        {\n",
+    "            \"name\": \"VectorSearchIndex\",\n",
+    "            \"key\": {\n",
+    "                \"contentVector\": \"cosmosSearch\"\n",
+    "            },\n",
+    "            \"cosmosSearchOptions\": { \n",
+    "                \"kind\": \"vector-hnsw\", \n",
+    "                \"m\": 16, # default value \n",
+    "                \"efConstruction\": 64, # default value \n",
+    "                \"similarity\": \"COS\", \n",
+    "                \"dimensions\": 1536\n",
+    "            } \n",
+    "        } \n",
+    "    ] \n",
+    "}\n",
+    ")"
    ]
   },
   {
@@ -327,7 +376,7 @@
     "                \"cosmosSearch\": {\n",
     "                    \"vector\": query_embedding,\n",
     "                    \"path\": \"contentVector\",\n",
-    "                    \"k\": num_results\n",
+    "                    \"k\": num_results #, \"efSearch\": 40 # optional for HNSW only \n",
     "                },\n",
     "                \"returnStoredSource\": True }},\n",
     "        {'$project': { 'similarityScore': { '$meta': 'searchScore' }, 'document' : '$$ROOT' } }\n",
@@ -346,12 +395,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
-    "query = \"What services do you have?\"\n",
+    "query = \"What are the services for running ML models?\"\n",
     "results = vector_search(query)\n",
     "for result in results: \n",
+    "#     print(result)\n",
     "    print(f\"Similarity Score: {result['similarityScore']}\")  \n",
     "    print(f\"Title: {result['document']['title']}\")  \n",
     "    print(f\"Content: {result['document']['content']}\")  \n",
@@ -411,11 +463,11 @@
     "user_input = input(\"Prompt: \")\n",
     "while user_input.lower() != \"end\":\n",
     "    results_for_prompt = vector_search(user_input)\n",
-    "    print(f\"User Prompt: {user_input}\")\n",
+    "   # print(f\"User Prompt: {user_input}\")\n",
     "    completions_results = generate_completion(results_for_prompt)\n",
     "    print(\"\\n\")\n",
     "    print(completions_results['choices'][0]['message']['content'])\n",
-    "    user_input = input(\"Prompt: \")"
+    "    user_input = input(\"Prompt: \")\n"
    ]
   }
  ],