Switch the RAG chatbot notebook to the synchronous CosmosClient after review

TheovanKraay · TheovanKraay · commit 7c0b13aec580 · 2024-07-10T19:04:53.000+01:00
diff --git a/Python/CosmosDB-NoSQL_VectorSearch/CosmosDB-NoSQL-Quickstart-RAG-Chatbot.ipynb b/Python/CosmosDB-NoSQL_VectorSearch/CosmosDB-NoSQL-Quickstart-RAG-Chatbot.ipynb
@@ -74,7 +74,7 @@
     "import gradio as gr\n",
     "\n",
     "# Cosmos DB imports\n",
-    "from azure.cosmos.aio import CosmosClient\n",
+    "from azure.cosmos import CosmosClient\n",
     "\n",
     "# Load configuration\n",
     "env_name = \"sample_env_file.env\"\n",
@@ -89,7 +89,7 @@
     "cosmos_cache = config['cosmos_cache_collection_name']\n",
     "\n",
-    "# Create the Azure Cosmos DB for NoSQL async client for faster data loading\n",
+    "# Create the Azure Cosmos DB for NoSQL client\n",
-    "cosmos_async_client = CosmosClient(url=cosmos_conn, credential=cosmos_key)\n",
+    "cosmos_client = CosmosClient(url=cosmos_conn, credential=cosmos_key)\n",
     "\n",
     "openai_endpoint = config['openai_endpoint']\n",
     "openai_key = config['openai_key']\n",
@@ -122,7 +122,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "db = await cosmos_async_client.create_database_if_not_exists(cosmos_database)\n",
+    "db = cosmos_client.create_database_if_not_exists(cosmos_database)\n",
     "\n",
     "# Create the vector embedding policy to specify vector details\n",
     "vector_embedding_policy = {\n",
@@ -146,29 +146,28 @@
     "    ]\n",
     "} \n",
     "\n",
-    "\n",
     "# Create the data collection with vector index (note: this creates a container with 10000 RUs to allow fast data load)\n",
     "try:\n",
-    "    container = await db.create_container_if_not_exists( id=cosmos_collection, \n",
+    "    movies_container = db.create_container_if_not_exists(id=cosmos_collection, \n",
     "                                                  partition_key=PartitionKey(path='/id'), \n",
     "                                                  vector_embedding_policy=vector_embedding_policy,\n",
     "                                                  offer_throughput=10000) \n",
-    "    print('Container with id \\'{0}\\' created'.format(id)) \n",
+    "    print('Container with id \\'{0}\\' created'.format(movies_container.id)) \n",
     "\n",
     "except exceptions.CosmosHttpResponseError: \n",
-    "        raise \n",
+    "    raise \n",
     "\n",
     "# Create the cache collection with vector index\n",
     "try:\n",
-    "    cache_container = await db.create_container_if_not_exists( id=cosmos_cache, \n",
+    "    cache_container = db.create_container_if_not_exists(id=cosmos_cache, \n",
     "                                                  partition_key=PartitionKey(path='/id'), \n",
     "                                                  indexing_policy=indexing_policy,\n",
     "                                                  vector_embedding_policy=vector_embedding_policy,\n",
     "                                                  offer_throughput=1000) \n",
-    "    print('Container with id \\'{0}\\' created'.format(id)) \n",
+    "    print('Container with id \\'{0}\\' created'.format(cache_container.id)) \n",
     "\n",
     "except exceptions.CosmosHttpResponseError: \n",
-    "        raise "
+    "    raise"
    ]
   },
   {
@@ -222,7 +221,7 @@
    "outputs": [],
    "source": [
     "# Unzip the data file\n",
-    "with zipfile.ZipFile(\"../../DataSet/Movies/MovieLens-4489-256D.zip\", 'r') as zip_ref:\n",
+    "with zipfile.ZipFile(\"../../DataSet/Movies/MovieLens-4489-256D.zip\", 'r') as zip_ref: \n",
     "    zip_ref.extractall(\"/Data\")\n",
     "zip_ref.close()\n",
     "# Load the data file\n",
@@ -262,16 +261,12 @@
    "outputs": [],
    "source": [
     "import asyncio\n",
-    "import nest_asyncio\n",
-    "import time  # Import the time module to measure execution time\n",
-    "\n",
-    "nest_asyncio.apply()\n",
+    "import time\n",
+    "from concurrent.futures import ThreadPoolExecutor\n",
     "\n",
-    "def generate_vectors(items, vector_property):\n",
+    "async def generate_vectors(items, vector_property):\n",
     "    for item in items:\n",
-    "        #print(f\"generating embedding for item {item}...\")\n",
-    "        vectorArray = generate_embeddings(item['overview'])\n",
-    "        time.sleep(0.1)\n",
+    "        vectorArray = await generate_embeddings(item['overview'])\n",
     "        item[vector_property] = vectorArray\n",
     "    return items\n",
     "\n",
@@ -282,22 +277,20 @@
     "    tasks = []\n",
     "    max_concurrency = 20  # Adjust this value to control the level of concurrency\n",
     "    semaphore = asyncio.Semaphore(max_concurrency)\n",
-    "    \n",
-    "    await cosmos_async_client.__aenter__()\n",
     "    print(\"Starting doc load, please wait...\")\n",
     "    \n",
+    "    def upsert_item_sync(obj):\n",
+    "        movies_container.upsert_item(body=obj)\n",
+    "    \n",
     "    async def upsert_object(obj):\n",
     "        nonlocal counter\n",
     "        async with semaphore:\n",
-    "            #The following code to create vector embeddings for the data is commented out as the sample data is already vectorized.\n",
-    "            #vectorArray = generate_embeddings(obj['overview'])\n",
-    "            #obj[cosmos_vector_property] = vectorArray\n",
-    "            await container.upsert_item(body=obj)\n",
+    "            await asyncio.get_event_loop().run_in_executor(None, upsert_item_sync, obj)\n",
     "            # Progress reporting\n",
     "            counter += 1\n",
     "            if counter % 100 == 0:\n",
-    "                print(f\"Sent {counter} documents for insertion into collection.\") \n",
-    "                \n",
+    "                print(f\"Sent {counter} documents for insertion into collection.\")\n",
+    "    \n",
     "    for obj in data:\n",
     "        tasks.append(asyncio.create_task(upsert_object(obj)))\n",
     "    \n",
@@ -310,29 +303,8 @@
     "    print(f\"Time taken: {duration:.2f} seconds ({duration:.3f} milliseconds)\")\n",
     "\n",
     "# Run the async function\n",
-    "await insert_data()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "590df42f-5416-4744-9968-5406730ed036",
-   "metadata": {},
-   "source": [
-    "# Set up containers for chat bot"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "70b0cb9e-14d6-48ef-bca4-76b626572bc3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from azure.cosmos import CosmosClient\n",
-    "cosmos_sync_client = CosmosClient(url=cosmos_conn, credential=cosmos_key)\n",
-    "db = cosmos_sync_client.get_database_client(cosmos_database)\n",
-    "movies_container = db.get_container_client(cosmos_collection)\n",
-    "cache_container = db.get_container_client(cosmos_cache)"
+    "await insert_data()\n",
+    " "
    ]
   },
   {