|
96 | 96 | "\n", |
97 | 97 | "Deploy the following models on Azure OpenAI:\n", |
98 | 98 | "\n", |
99 | | - "- Text-embedding-ada-02 on Azure OpenAI for embeddings\n", |
100 | | - "- GPT-35-Turbo on Azure OpenAI for chat completion\n", |
| 99 | + "- text-embedding-3-large on Azure OpenAI for embeddings\n", |
| 100 | + "- gpt-4o on Azure OpenAI for chat completion\n", |
101 | 101 | "\n", |
102 | 102 | "You must have [**Cognitive Services OpenAI Contributor**]( /azure/ai-services/openai/how-to/role-based-access-control#cognitive-services-openai-contributor) or higher to deploy models in Azure OpenAI.\n", |
103 | 103 | "\n", |
|
107 | 107 | "\n", |
108 | 108 | "1. Select **Deploy model** > **Deploy base model**.\n", |
109 | 109 | "\n", |
110 | | - "1. Select **text-embedding-ada-02** from the dropdown list and confirm the selection.\n", |
| 110 | + "1. Select **text-embedding-3-large** from the dropdown list and confirm the selection.\n", |
111 | 111 | "\n", |
112 | | - "1. Specify a deployment name. We recommend \"text-embedding-ada-002\".\n", |
| 112 | + "1. Specify a deployment name. We recommend \"text-embedding-3-large\".\n", |
113 | 113 | "\n", |
114 | 114 | "1. Accept the defaults.\n", |
115 | 115 | "\n", |
116 | 116 | "1. Select **Deploy**.\n", |
117 | 117 | "\n", |
118 | | - "1. Repeat the previous steps for **gpt-35-turbo**.\n", |
| 118 | + "1. Repeat the previous steps for **gpt-4o**.\n", |
119 | 119 | "\n", |
120 | | - "Make a note of the model names and endpoint. Embedding skills and vectorizers assemble the full endpoint internally, so you only need the resource URI. For example, given `https://MY-FAKE-ACCOUNT.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-05-15`, the endpoint you should provide in skill and vectorizer definitions is `https://MY-FAKE-ACCOUNT.openai.azure.com`.\n", |
| 120 | + "Make a note of the model names and endpoint. Embedding skills and vectorizers assemble the full endpoint internally, so you only need the resource URI. For example, given `https://MY-FAKE-ACCOUNT.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2024-06-01`, the endpoint you should provide in skill and vectorizer definitions is `https://MY-FAKE-ACCOUNT.openai.azure.com`.\n", |
121 | 121 | "\n", |
122 | 122 | "### Configure search engine role-based access to Azure Storage\n", |
123 | 123 | "\n", |
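Since embedding skills and vectorizers take only the resource URI, a quick smoke test of both deployments through that URI can catch naming mistakes before the skillset is wired up. This is a sketch, not part of the committed notebook: it assumes `AZURE_OPENAI_ACCOUNT` holds the resource URI (it is defined elsewhere in the notebook) and that the deployment names match the recommendations above.

```python
# Sketch only: smoke-test both deployments through the resource URI.
# AZURE_OPENAI_ACCOUNT is assumed to be defined elsewhere in the notebook.
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI

token_provider = get_bearer_token_provider(
    DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
)
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ACCOUNT,   # resource URI only, no /deployments path
    azure_ad_token_provider=token_provider,
    api_version="2024-06-01",
)

# One call per deployment name recommended above.
embedding = client.embeddings.create(model="text-embedding-3-large", input="ping")
chat = client.chat.completions.create(
    model="gpt-4o", messages=[{"role": "user", "content": "ping"}]
)
print(len(embedding.data[0].embedding), chat.choices[0].message.content)
```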
|
192 | 192 | }, |
193 | 193 | { |
194 | 194 | "cell_type": "code", |
195 | | - "execution_count": 11, |
| 195 | + "execution_count": 1, |
196 | 196 | "metadata": {}, |
197 | 197 | "outputs": [], |
198 | 198 | "source": [ |
|
249 | 249 | " SearchField(name=\"locations\", type=SearchFieldDataType.Collection(SearchFieldDataType.String), filterable=True),\n", |
250 | 250 | " SearchField(name=\"chunk_id\", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name=\"keyword\"), \n", |
251 | 251 | " SearchField(name=\"chunk\", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False), \n", |
252 | | - " SearchField(name=\"text_vector\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile_name=\"myHnswProfile\")\n", |
| 252 | + " SearchField(name=\"text_vector\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1024, vector_search_profile_name=\"myHnswProfile\")\n", |
253 | 253 | " ] \n", |
254 | 254 | " \n", |
255 | 255 | "# Configure the vector search configuration \n", |
|
270 | 270 | " kind=\"azureOpenAI\", \n", |
271 | 271 | " parameters=AzureOpenAIVectorizerParameters( \n", |
272 | 272 | " resource_url=AZURE_OPENAI_ACCOUNT, \n", |
273 | | - " deployment_name=\"text-embedding-ada-002\",\n", |
274 | | - " model_name=\"text-embedding-ada-002\"\n", |
| 273 | + " deployment_name=\"text-embedding-3-large\",\n", |
| 274 | + " model_name=\"text-embedding-3-large\"\n", |
275 | 275 | " ),\n", |
276 | 276 | " ), \n", |
277 | 277 | " ], \n", |
|
294 | 294 | }, |
295 | 295 | { |
296 | 296 | "cell_type": "code", |
297 | | - "execution_count": null, |
| 297 | + "execution_count": 4, |
298 | 298 | "metadata": {}, |
299 | | - "outputs": [], |
| 299 | + "outputs": [ |
| 300 | + { |
| 301 | + "name": "stdout", |
| 302 | + "output_type": "stream", |
| 303 | + "text": [ |
| 304 | + "Data source 'py-rag-tutorial-ds' created or updated\n" |
| 305 | + ] |
| 306 | + } |
| 307 | + ], |
300 | 308 | "source": [ |
301 | 309 | "from azure.search.documents.indexes import SearchIndexerClient\n", |
302 | 310 | "from azure.search.documents.indexes.models import (\n", |
|
306 | 314 | "\n", |
307 | 315 | "# Create a data source \n", |
308 | 316 | "indexer_client = SearchIndexerClient(endpoint=AZURE_SEARCH_SERVICE, credential=credential)\n", |
309 | | - "container = SearchIndexerDataContainer(name=\"nasa-ebooks-pdfs-all\")\n", |
| 317 | + "container = SearchIndexerDataContainer(name=\"nasatext\")\n", |
310 | 318 | "data_source_connection = SearchIndexerDataSourceConnection(\n", |
311 | 319 | " name=\"py-rag-tutorial-ds\",\n", |
312 | 320 | " type=\"azureblob\",\n", |
|
329 | 337 | }, |
330 | 338 | { |
331 | 339 | "cell_type": "code", |
332 | | - "execution_count": null, |
| 340 | + "execution_count": 6, |
333 | 341 | "metadata": {}, |
334 | | - "outputs": [], |
| 342 | + "outputs": [ |
| 343 | + { |
| 344 | + "name": "stdout", |
| 345 | + "output_type": "stream", |
| 346 | + "text": [ |
| 347 | + "py-rag-tutorial-ss created\n" |
| 348 | + ] |
| 349 | + } |
| 350 | + ], |
335 | 351 | "source": [ |
336 | 352 | "from azure.search.documents.indexes.models import (\n", |
337 | 353 | " SplitSkill,\n", |
|
368 | 384 | " description=\"Skill to generate embeddings via Azure OpenAI\", \n", |
369 | 385 | " context=\"/document/pages/*\", \n", |
370 | 386 | " resource_url=AZURE_OPENAI_ACCOUNT, \n", |
371 | | - " deployment_name=\"text-embedding-ada-002\", \n", |
372 | | - " model_name=\"text-embedding-ada-002\",\n", |
373 | | - " dimensions=1536,\n", |
| 387 | + " deployment_name=\"text-embedding-3-large\", \n", |
| 388 | + " model_name=\"text-embedding-3-large\",\n", |
| 389 | + " dimensions=1024,\n", |
374 | 390 | " inputs=[ \n", |
375 | 391 | " InputFieldMappingEntry(name=\"text\", source=\"/document/pages/*\"), \n", |
376 | 392 | " ], \n", |
|
439 | 455 | }, |
440 | 456 | { |
441 | 457 | "cell_type": "code", |
442 | | - "execution_count": null, |
| 458 | + "execution_count": 7, |
443 | 459 | "metadata": {}, |
444 | | - "outputs": [], |
| 460 | + "outputs": [ |
| 461 | + { |
| 462 | + "name": "stdout", |
| 463 | + "output_type": "stream", |
| 464 | + "text": [ |
| 465 | + " py-rag-tutorial-idxr is created and running. Give the indexer a few minutes before running a query.\n" |
| 466 | + ] |
| 467 | + } |
| 468 | + ], |
445 | 469 | "source": [ |
446 | 470 | "from azure.search.documents.indexes.models import (\n", |
447 | | - " SearchIndexer,\n", |
448 | | - " FieldMapping\n", |
| 471 | + " SearchIndexer\n", |
449 | 472 | ")\n", |
450 | 473 | "\n", |
451 | 474 | "# Create an indexer \n", |
|
459 | 482 | " skillset_name=skillset_name, \n", |
460 | 483 | " target_index_name=index_name, \n", |
461 | 484 | " data_source_name=data_source.name,\n", |
462 | | - " # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results \n", |
463 | | - " field_mappings=[FieldMapping(source_field_name=\"metadata_storage_name\", target_field_name=\"title\")],\n", |
464 | 485 | " parameters=indexer_parameters\n", |
465 | 486 | ") \n", |
466 | 487 | "\n", |
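Rather than guessing at "a few minutes," the indexer can be polled until its latest run finishes. A sketch under the assumption that `indexer_client` from the data source cell is still in scope; `py-rag-tutorial-idxr` is the indexer name printed in the output above.

```python
# Sketch: poll until the most recent indexer run is no longer in progress.
# Assumes indexer_client from the data source cell above.
import time

while True:
    last = indexer_client.get_indexer_status("py-rag-tutorial-idxr").last_result
    if last is not None and last.status != "inProgress":
        print(f"Indexer finished: {last.status}, {last.item_count} documents processed")
        break
    time.sleep(15)
```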
|
482 | 503 | }, |
483 | 504 | { |
484 | 505 | "cell_type": "code", |
485 | | - "execution_count": null, |
| 506 | + "execution_count": 34, |
486 | 507 | "metadata": {}, |
487 | | - "outputs": [], |
| 508 | + "outputs": [ |
| 509 | + { |
| 510 | + "name": "stdout", |
| 511 | + "output_type": "stream", |
| 512 | + "text": [ |
| 513 | + "Score: 0.01666666753590107\n", |
| 514 | + "Chunk: national Aeronautics and Space Administration\n", |
| 515 | + "\n", |
| 516 | + "earth Science\n", |
| 517 | + "\n", |
| 518 | + "NASA Headquarters \n", |
| 519 | + "\n", |
| 520 | + "300 E Street SW \n", |
| 521 | + "\n", |
| 522 | + "Washington, DC 20546\n", |
| 523 | + "\n", |
| 524 | + "www.nasa.gov\n", |
| 525 | + "\n", |
| 526 | + "np-2018-05-2546-hQ\n" |
| 527 | + ] |
| 528 | + } |
| 529 | + ], |
488 | 530 | "source": [ |
489 | 531 | "from azure.search.documents import SearchClient\n", |
490 | 532 | "from azure.search.documents.models import VectorizableTextQuery\n", |
491 | 533 | "\n", |
492 | 534 | "# Vector Search using text-to-vector conversion of the querystring\n", |
493 | | - "query = \"where are NASA's headquarters located?\" \n", |
| 535 | + "query = \"what's NASA's website?\" \n", |
494 | 536 | "\n", |
495 | 537 | "search_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE, credential=credential, index_name=index_name)\n", |
496 | | - "vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields=\"text_vector\", exhaustive=True)\n", |
| 538 | + "vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields=\"text_vector\")\n", |
497 | 539 | " \n", |
498 | 540 | "results = search_client.search( \n", |
499 | 541 | " search_text=query, \n", |
500 | 542 | " vector_queries= [vector_query],\n", |
501 | | - " select=[\"parent_id\", \"chunk_id\", \"title\", \"chunk\", \"locations\"],\n", |
| 543 | + " select=[\"chunk\"],\n", |
502 | 544 | " top=1\n", |
503 | 545 | ") \n", |
504 | 546 | " \n", |
505 | 547 | "for result in results: \n", |
506 | 548 | " print(f\"Score: {result['@search.score']}\")\n", |
507 | | - " print(f\"Title: {result['title']}\")\n", |
508 | | - " print(f\"Locations: {result['locations']}\")\n", |
509 | | - " print(f\"Content: {result['chunk']}\")" |
| 549 | + " print(f\"Chunk: {result['chunk']}\")" |
510 | 550 | ] |
511 | 551 | }, |
512 | 552 | { |
|
522 | 562 | }, |
523 | 563 | { |
524 | 564 | "cell_type": "code", |
525 | | - "execution_count": null, |
| 565 | + "execution_count": 43, |
526 | 566 | "metadata": {}, |
527 | 567 | "outputs": [], |
528 | 568 | "source": [ |
|
537 | 577 | " azure_ad_token_provider=token_provider\n", |
538 | 578 | " )\n", |
539 | 579 | "\n", |
540 | | - "deployment_name = \"gpt-35-turbo\"\n", |
| 580 | + "deployment_name = \"gpt-4o\"\n", |
541 | 581 | "\n", |
542 | 582 | "search_client = SearchClient(\n", |
543 | 583 | " endpoint=AZURE_SEARCH_SERVICE,\n", |
|
551 | 591 | "Answer the query using only the sources provided below.\n", |
552 | 592 | "Use bullets if the answer has multiple points.\n", |
553 | 593 | "If the answer is longer than 3 sentences, provide a summary.\n", |
554 | | - "Answer ONLY with the facts listed in the list of sources below.\n", |
| 594 | + "Answer ONLY with the facts listed in the list of sources below. Cite your source when you answer the question\n", |
555 | 595 | "If there isn't enough information below, say you don't know.\n", |
556 | 596 | "Do not generate answers that don't use the sources below.\n", |
557 | 597 | "Query: {query}\n", |
558 | 598 | "Sources:\\n{sources}\n", |
559 | | - "\"\"\"\n", |
560 | | - "\n", |
| 599 | + "\"\"\"" |
| 600 | + ] |
| 601 | + }, |
| 602 | + { |
| 603 | + "cell_type": "code", |
| 604 | + "execution_count": 44, |
| 605 | + "metadata": {}, |
| 606 | + "outputs": [], |
| 607 | + "source": [ |
561 | 608 | "# Provide the query. Notice it's sent to both the search engine and the LLM.\n", |
562 | 609 | "# The query sent to the search engine is hybrid. Keyword search on \"query\". Text-to-vector conversion for vector search.\n", |
563 | | - "query=\"how much of earth is covered by water\"\n", |
564 | | - "vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields=\"text_vector\", exhaustive=True)\n", |
| 610 | + "query=\"What's the NASA earth book about?\"\n", |
| 611 | + "vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=50, fields=\"text_vector\")\n", |
565 | 612 | "\n", |
566 | 613 | "# Set up the search results and the chat thread.\n", |
567 | 614 | "# Retrieve the selected fields from the search index related to the question.\n", |
568 | 615 | "search_results = search_client.search(\n", |
569 | 616 | " search_text=query,\n", |
570 | 617 | " vector_queries= [vector_query],\n", |
571 | | - " select=\"title, chunk, locations\",\n", |
572 | | - " top=1,\n", |
| 618 | + " select=[\"title\", \"chunk\", \"locations\"],\n", |
| 619 | + " top=5,\n", |
573 | 620 | ")\n", |
574 | | - "sources_formatted = \"\\n\".join([f'{document[\"title\"]}:{document[\"chunk\"]}:{document[\"locations\"]}' for document in search_results])\n", |
575 | 621 | "\n", |
| 622 | + "# Newlines could be in the OCR'd content. Use a unique separator to make the sources distinct\n", |
| 623 | + "sources_formatted = \"=================\\n\".join([f'TITLE: {document[\"title\"]}, CONTENT: {document[\"chunk\"]}, LOCATIONS: {document[\"locations\"]}' for document in search_results])\n" |
| 624 | + ] |
| 625 | + }, |
| 626 | + { |
| 627 | + "cell_type": "code", |
| 628 | + "execution_count": 45, |
| 629 | + "metadata": {}, |
| 630 | + "outputs": [ |
| 631 | + { |
| 632 | + "name": "stdout", |
| 633 | + "output_type": "stream", |
| 634 | + "text": [ |
| 635 | + "The NASA Earth book stands at the intersection of science and art, using NASA's unique vantage point and tools to study Earth’s physical processes from beneath the crust to the edge of the atmosphere. It presents Earth as a dynamic system, examining cycles and processes such as the water cycle, carbon cycle, ocean circulation, and the movement of heat. The book uses images to tell the story of Earth's land, wind, water, ice, and air as seen from above, showcasing the planet’s diverse colors, textures, and shapes.\n", |
| 636 | + "\n", |
| 637 | + "- It aims to inspire by presenting a 4.5-billion-year-old planet through striking images.\n", |
| 638 | + "- The book highlights how light is observed and studied, reflecting NASA’s scientific pursuits and artistic sensibilities.\n", |
| 639 | + "- It emphasizes the awe-inspiring beauty of Earth, which NASA captures from space.\n", |
| 640 | + "\n", |
| 641 | + "(Source: page-8.pdf)\n" |
| 642 | + ] |
| 643 | + } |
| 644 | + ], |
| 645 | + "source": [ |
576 | 646 | "response = openai_client.chat.completions.create(\n", |
577 | 647 | " messages=[\n", |
578 | 648 | " {\n", |
|