add updates to dormant code that vectorizes raw data

TheovanKraay · TheovanKraay · commit 4e9cdbc9cafd · 2024-09-30T19:41:08.000+01:00
diff --git a/Python/CosmosDB-NoSQL_VectorSearch/CosmosDB-NoSQL-Quickstart-RAG-Chatbot.ipynb b/Python/CosmosDB-NoSQL_VectorSearch/CosmosDB-NoSQL-Quickstart-RAG-Chatbot.ipynb
@@ -265,8 +265,9 @@
    "outputs": [],
    "source": [
     "#The following code to get raw movies data is commented out in favour of getting pre-vectorised data\n",
+    "#To vectorize the raw data from storage_file_url yourself, uncomment the two lines below and set vectorizeFlag=True in the next cell\n",
     "#data = urllib.request.urlopen(storage_file_url)\n",
-    "#data = json.load(data)"
+    "#data = json.load(data)\n"
    ]
   },
   {
@@ -276,19 +277,43 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "vectorizeFlag=False\n",
+    "\n",
     "import asyncio\n",
     "import time\n",
     "from concurrent.futures import ThreadPoolExecutor\n",
     "\n",
     "async def generate_vectors(items, vector_property):\n",
-    "    for item in items:\n",
-    "        vectorArray = await generate_embeddings(item['overview'])\n",
-    "        item[vector_property] = vectorArray\n",
+    "    # Get the event loop; run_in_executor(None, ...) below uses its default executor to run the synchronous generate_embeddings\n",
+    "    loop = asyncio.get_event_loop()\n",
+    "    \n",
+    "    # Define a function to call generate_embeddings using run_in_executor\n",
+    "    async def generate_embedding_for_item(item):\n",
+    "        try:\n",
+    "            # Offload the sync generate_embeddings to a thread\n",
+    "            vectorArray = await loop.run_in_executor(None, generate_embeddings, item['overview'])\n",
+    "            item[vector_property] = vectorArray\n",
+    "        except Exception as e:\n",
+    "            # Log the failure and carry on; an item whose embedding call failed is left without the vector property\n",
+    "            logging.error(f\"Error generating embedding for item: {item['overview'][:50]}...\", exc_info=True)\n",
+    "    \n",
+    "    # Create tasks for all the items to generate embeddings concurrently\n",
+    "    tasks = [generate_embedding_for_item(item) for item in items]\n",
+    "    \n",
+    "    # Run all the tasks concurrently and wait for their completion\n",
+    "    await asyncio.gather(*tasks)\n",
+    "    \n",
     "    return items\n",
     "\n",
-    "async def insert_data():\n",
+    "async def insert_data(vectorize=False):\n",
     "    start_time = time.time()  # Record the start time\n",
     "    \n",
+    "    # If vectorize flag is True, generate vectors for the data\n",
+    "    if vectorize:\n",
+    "        print(\"Vectorizing data, please wait...\")\n",
+    "        global data\n",
+    "        data = await generate_vectors(data, \"vector\")\n",
+    "\n",
     "    counter = 0\n",
     "    tasks = []\n",
     "    max_concurrency = 5  # Adjust this value to control the level of concurrency\n",
@@ -318,9 +343,8 @@
     "    print(f\"All {counter} documents inserted!\")\n",
     "    print(f\"Time taken: {duration:.2f} seconds ({duration:.3f} milliseconds)\")\n",
     "\n",
-    "# Run the async function\n",
-    "await insert_data()\n",
-    " "
+    "# Run the async function with the vectorize flag set to True or False as needed\n",
+    "await insert_data(vectorizeFlag)  # or await insert_data() for default\n"
    ]
   },
   {