diff --git a/notebooks/enterprise-search/app-search-engine-exporter.ipynb b/notebooks/enterprise-search/app-search-engine-exporter.ipynb index c460c80a..fe049e78 100644 --- a/notebooks/enterprise-search/app-search-engine-exporter.ipynb +++ b/notebooks/enterprise-search/app-search-engine-exporter.ipynb @@ -12,6 +12,8 @@ "\n", "This notebook explains the steps of exporting an App Search engine together with its configurations in Elasticsearch. This is not meant to be an exhaustive example for all App Search features as those will vary based on your instance, but is meant to give a sense of how you can export, migrate, and enhance your application.\n", "\n", + "NOTE: This notebook is designed to work with Elasticsearch **8.18** or higher. If you are running this notebook against an older version of Elasticsearch, we note commands that will need to be modified.\n", + "\n", "We will look at:\n", "\n", "- how to export synonyms\n", @@ -57,7 +59,7 @@ "source": [ "## Connect to Elasticsearch\n", "\n", - "ℹ️ We're using an Elastic Cloud deployment of Elasticsearch for this notebook. If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?onboarding_token=search&utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial. \n", + "ℹ️ We're using an Elastic Cloud deployment of Elasticsearch for this notebook. If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?onboarding_token=search&utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial. This notebook is designed to be run against an Elasticsearch deployment running on version 8.18 or higher.\n", "\n", "We'll use the **Cloud ID** to identify our deployment, because we are using Elastic Cloud deployment. To find the Cloud ID for your deployment, go to https://cloud.elastic.co/deployments and select your deployment. 
\n", "\n", @@ -66,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -95,12 +97,12 @@ "\n", "You can find your App Search endpoint and your search private key from the `Credentials` menu inside your App Search instance in Kibana.\n", "\n", - "Also note here, we define our `ENGINE_NAME`. For this examplem we are using the `national-parks-demo` sample engine that is available within App Search." + "Also note here, we define our `ENGINE_NAME`. For this example, we are using the `national-parks-demo` sample engine that is available within App Search." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -129,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "id": "kpV8K5jHvRK6" }, @@ -173,9 +175,9 @@ "\n", "Next, we will export any curations that may be in our App Search engine.\n", "\n", - "To export App Search curations we will use Elasticsearch [query rules](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-using-query-rules.html).\n", - "At the moment of writing this notebook Elasticsearch query rules only allow for pinning results unlike App Search curations that also allow excluding results.\n", - "For this reason we will only export pinned results. The code below will create the necessary `query_rules` to achieve this. Note that there is a default soft limit of 100 curations for `query_rules` that can be configured up to a hard limit of 1,000." + "To export App Search curations we will use Elasticsearch [query rules](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-using-query-rules.html). The code below will create the necessary `query_rules` to achieve this. 
Note that there is a default soft limit of 100 curations for `query_rules` that can be configured up to a hard limit of 1,000.\n", + "\n", + "NOTE: This example outputs query rules requiring `exact` matches, which are case-sensitive. If you need typo tolerance, consider using `fuzzy`. If you need different case values consider adding multiple values to your criteria. " ] }, { @@ -187,24 +189,60 @@ "query_rules = []\n", "\n", "for curation in app_search.list_curations(engine_name=ENGINE_NAME).body[\"results\"]:\n", - " query_rules.append(\n", - " {\n", - " \"rule_id\": curation[\"id\"],\n", - " \"type\": \"pinned\",\n", - " \"criteria\": [\n", - " {\n", - " \"type\": \"exact\",\n", - " \"metadata\": \"user_query\",\n", - " \"values\": curation[\"queries\"],\n", - " }\n", - " ],\n", - " \"actions\": {\"ids\": curation[\"promoted\"]},\n", - " }\n", - " )\n", + " if curation[\"promoted\"]:\n", + " query_rules.append(\n", + " {\n", + " \"rule_id\": curation[\"id\"] + \"-pinned\",\n", + " \"type\": \"pinned\",\n", + " \"criteria\": [\n", + " {\n", + " \"type\": \"exact\",\n", + " \"metadata\": \"user_query\",\n", + " \"values\": curation[\"queries\"],\n", + " }\n", + " ],\n", + " \"actions\": {\"ids\": curation[\"promoted\"]},\n", + " }\n", + " )\n", + " if curation[\"hidden\"]:\n", + " query_rules.append(\n", + " {\n", + " \"rule_id\": curation[\"id\"] + \"-exclude\",\n", + " \"type\": \"exclude\",\n", + " \"criteria\": [\n", + " {\n", + " \"type\": \"exact\",\n", + " \"metadata\": \"user_query\",\n", + " \"values\": curation[\"queries\"],\n", + " }\n", + " ],\n", + " \"actions\": {\"ids\": curation[\"hidden\"]},\n", + " }\n", + " )\n", "\n", "elasticsearch.query_rules.put_ruleset(ruleset_id=ENGINE_NAME, rules=query_rules)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a quick look at the query rules we've migrated. We'll do this via the `GET _query_rules/ENGINE_NAME` endpoint. 
Note that curations with both pinned and hidden documents will be represented as two rules in the ruleset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\n", + " json.dumps(\n", + " elasticsearch.query_rules.get_ruleset(ruleset_id=ENGINE_NAME).body, indent=2\n", + " )\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": { @@ -215,7 +253,15 @@ "\n", "We recommend reindexing your App Search engine data into a new Elasticsearch index instead of reusing the existing one. This allows you to update the index mapping to take advantage of modern features like semantic search and the newly created Elasticsearch synonym set.\n", "\n", - "App Search has the following data types: `text`, `number`, `date` and `geolocation`. Each of these types is mapped to Elasticsearch field types.\n", + "App Search has the following data types:\n", + "\n", + "- `text`\n", + "- `number`\n", + "- `date`\n", + "- `geolocation`\n", + " \n", + "Each of these types is mapped to Elasticsearch field types.\n", + "\n", "We can take a closer look at how App Search field types are mapped to Elasticsearch fields, by using the [`GET mapping API`](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-get-mapping.html).\n", "For App Search engines, the associated Elasticsearch index name is `.ent-search-engine-documents-[ENGINE_NAME]`, e.g. 
`.ent-search-engine-documents-national-parks-demo` for the App Search sample engine `national-parks-demo`.\n", "One thing to notice is how App Search uses [multi-fields](https://www.elastic.co/guide/en/elasticsearch/reference/current/multi-fields.html) in Elasticsearch that allow for quickly changing the field type in App Search without requiring reindexing by creating subfields for each type of supported field:\n", @@ -578,38 +624,11 @@ "source": [ "# Add semantic text fields for semantic search (optional)\n", "\n", - "One of the advantages of exporting our index directly to Elasticsearch is that we can easily perform semantic search with ELSER. To do this, we'll need to add an inference endpoint using ELSER, and a `semantic_text` field to our index to use it.\n", + "One of the advantages of exporting our index directly to Elasticsearch is that we can easily perform semantic search with ELSER. To do this, we'll need to add a `semantic_text` field to our index to use it. We will set up a `semantic_text` field using our default ELSER endpoint.\n", "\n", - "Note that to use this feature, your cluster must have at least one ML node set up with enough resources allocated to it.\n", + "Note that to use this feature, your cluster must be running at least version 8.15.0 and have at least one ML node set up with enough resources allocated to it.\n", "\n", - "If you have not already, be sure that your ELSER v2 model is [setup and deployed](https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-elser.html).\n", - "\n", - "Let's first start by creating our inference endpoint using the [Create inference API]](https://www.elastic.co/guide/en/elasticsearch/reference/current/put-inference-api.html)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# delete our inference endpoint if it is already created\n", - "if elasticsearch.inference.get(inference_id=\"elser_inference_endpoint\"):\n", - " elasticsearch.inference.delete(inference_id=\"elser_inference_endpoint\")\n", - "\n", - "# and create our endpoint using the ELSER v2 model\n", - "elasticsearch.inference.put(\n", - " inference_id=\"elser_inference_endpoint\",\n", - " inference_config={\n", - " \"service\": \"elasticsearch\",\n", - " \"service_settings\": {\n", - " \"model_id\": \".elser_model_2_linux-x86_64\",\n", - " \"num_allocations\": 1,\n", - " \"num_threads\": 1,\n", - " },\n", - " },\n", - " task_type=\"sparse_embedding\",\n", - ")" + "If you do not have an ELSER endpoint running, it will be automatically downloaded, deployed and started for you when you use `semantic_text`. This means the first few commands may take a while as the model loads. For Elasticsearch versions below 8.17, you will need to create an inference endpoint and add it to the `semantic_text` mapping." ] }, { @@ -618,7 +637,7 @@ "source": [ "## Using semantic text fields for ingest and query\n", "\n", - "Next, we'll augment our text fields with `semantic_text` fields in our index. We'll do this by creating a `semtantic_text` field, and providing a `copy_to` directive from the original source field to copy the text into our semantic text fields.\n", + "First, we'll augment our text fields with `semantic_text` fields in our index. We'll do this by creating a `semtantic_text` field, and providing a `copy_to` directive from the original source field to copy the text into our semantic text fields.\n", "\n", "In the example below, we are using the `description` and `title` fields from our example index to add semantic search on those fields." 
] @@ -636,10 +655,7 @@ "# add the semantic_text field to our mapping for each field defined\n", "for field_name in SEMANTIC_TEXT_FIELDS:\n", " semantic_field_name = field_name + \"_semantic\"\n", - " mapping[semantic_field_name] = {\n", - " \"type\": \"semantic_text\",\n", - " \"inference_id\": \"elser_inference_endpoint\",\n", - " }\n", + " mapping[semantic_field_name] = {\"type\": \"semantic_text\"}\n", "\n", "# and for our text fields, add a \"copy_to\" directive to copy the text to the semantic_text field\n", "for field_name in SEMANTIC_TEXT_FIELDS:\n", @@ -778,7 +794,7 @@ "\n", "For the results, we sort on our score descending as the primary sort, with the document id as the secondary.\n", "\n", - "We apply highlights to our results, request a return size of the top 10 hits, and for each hit, return the result fields." + "We apply highlights to returned text search descriptions, request a return size of the top 10 hits, and for each hit, return the result fields." ] }, { @@ -826,7 +842,7 @@ " \"order\": \"score\",\n", " \"encoder\": \"html\",\n", " \"require_field_match\": False,\n", - " \"fields\": {},\n", + " \"fields\": {\"description\": {\"pre_tags\": [\"\"], \"post_tags\": [\"\"]}},\n", " },\n", " \"size\": 10,\n", " \"_source\": result_fields,\n", @@ -849,7 +865,7 @@ "outputs": [], "source": [ "results = elasticsearch.search(\n", - " index=SOURCE_INDEX,\n", + " index=DEST_INDEX,\n", " query=app_search_query_payload[\"query\"],\n", " highlight=app_search_query_payload[\"highlight\"],\n", " source=app_search_query_payload[\"_source\"],\n", @@ -866,7 +882,9 @@ "### How to do semantic search using ELSER with semantic text fields\n", "\n", "If you [enabled and reindexed your data with ELSER](#add-sparse_vector-fields-for-semantic-search-optional), we can now use this to do semantic search.\n", - "For each `semantic_text` field type, we can define a [semantic query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-semantic-query.html) 
to easily perform a semantic search on these fields.\n" + "For each `semantic_text` field type, we can define a [match query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html) to easily perform a semantic search on these fields.\n", + "\n", + "NOTE: For Elasticsearch versions prior to 8.18, a [semantic query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-semantic-query.html) should be used to perform a semantic search on these fields.\n" ] }, { @@ -881,14 +899,7 @@ "\n", "for field_name in SEMANTIC_TEXT_FIELDS:\n", " semantic_field_name = field_name + \"_semantic\"\n", - " semantic_text_queries.append(\n", - " {\n", - " \"semantic\": {\n", - " \"field\": semantic_field_name,\n", - " \"query\": QUERY_STRING,\n", - " }\n", - " }\n", - " )\n", + " semantic_text_queries.append({\"match\": {semantic_field_name: QUERY_STRING}})\n", "\n", "semantic_query = {\"bool\": {\"should\": semantic_text_queries}}\n", "print(f\"Elasticsearch query:\\n{json.dumps(semantic_query, indent=2)}\\n\")" @@ -926,7 +937,7 @@ " \"should\": [\n", " // multi_match query with best_fields from App Search generated query\n", " // multi_match query with cross_fields from App Search generated query\n", - " // text_expansion queries for sparse_vector fields\n", + " // match queries for semantic_text fields\n", " ]\n", " }\n", " } \n", @@ -960,7 +971,7 @@ "outputs": [], "source": [ "results = elasticsearch.search(\n", - " index=SOURCE_INDEX,\n", + " index=DEST_INDEX,\n", " query=payload[\"query\"],\n", " highlight=payload[\"highlight\"],\n", " source=payload[\"_source\"],\n", @@ -969,7 +980,7 @@ " min_score=1,\n", ")\n", "\n", - "print(f\"Text expansion query results:\\n{json.dumps(results.body, indent=2)}\\n\")" + "print(f\"Semantic query results:\\n{json.dumps(results.body, indent=2)}\\n\")" ] } ], diff --git a/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb 
b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb new file mode 100644 index 00000000..7861aeae --- /dev/null +++ b/supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb @@ -0,0 +1,480 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "# Overview\n", + "This notebook provides the following: \n", + "\n", + "1. Parses PDFs with [Azure Document Intelligence](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence/) that have text and tables. Each PDF is saved as a JSON file so that it can be loaded into elastic. \n", + "2. Loads JSON files into Elasticsearch. This notebook uses the elasticsearch python client to create an index with E5 and ELSER semantic_text mappings. \n", + "3. Once the data is loaded into Elasticsearch, you can ask questions in Playground and get answers grounded in truth. The index \"id\" field uses the following naming convention: PDF_FILENAME.pdf_PAGENUMBER. That allows you to see PDF and page number in the \"document sources\" link.\n", + "\n", + "**This notebook cannot be used to parse PDF images.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Install python dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install elasticsearch python-dotenv tqdm azure-core azure-ai-documentintelligence requests httpx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create a .env file that has the following entries. 
\n", + "\n", + "## Elasticsearch \n", + "- You must have a functional elasticsearch environment that has an `enterprise` level license\n", + "- The fastest way to get up and running is to use the [Elastic Serverless - Get started](https://www.elastic.co/guide/en/serverless/current/elasticsearch-get-started.html) guide\n", + "\n", + "```\n", + "ES_URL=?\n", + "ES_API_KEY=?\n", + "```\n", + "\n", + "## Azure AI Document Intelligence\n", + "\n", + "```\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT=?\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY=?\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create input and output folders\n", + "\n", + "- /pdf - place your PDF files in this input folder\n", + "- /json - parser will output one json file for each pdf in this output folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "input_folder_pdf = \"./pdf\"\n", + "output_folder_pdf = \"./json\"\n", + "\n", + "folders = [input_folder_pdf, output_folder_pdf]\n", + "\n", + "\n", + "def create_folders_if_not_exist(folders):\n", + " for folder in folders:\n", + " os.makedirs(folder, exist_ok=True)\n", + " print(f\"Folder '{folder}' created or already exists.\")\n", + "\n", + "\n", + "create_folders_if_not_exist(folders)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download PDF files\n", + "\n", + "- This notebook downloads 4 recent Elastic SEC 10-Q quarterly reports\n", + "- If you already have PDF files, feel free to place them in `./pdf` folder " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "\n", + "\n", + "def download_pdf(url, directory=\"./pdf\", filename=None):\n", + " if not os.path.exists(directory):\n", + " os.makedirs(directory)\n", + "\n", + " response = requests.get(url)\n", + " if 
response.status_code == 200:\n", + " if filename is None:\n", + " filename = url.split(\"/\")[-1]\n", + " filepath = os.path.join(directory, filename)\n", + " with open(filepath, \"wb\") as file:\n", + " file.write(response.content)\n", + " print(f\"Downloaded {filepath}\")\n", + " else:\n", + " print(f\"Failed to download file from {url}\")\n", + "\n", + "\n", + "print(\"Downloading 4 recent 10-Q reports for Elastic NV.\")\n", + "base_url = \"https://s201.q4cdn.com/217177842/files/doc_financials\"\n", + "download_pdf(\n", + " f\"{base_url}/2025/q2/e5aa7a0a-6f56-468d-a5bd-661792773d71.pdf\",\n", + " filename=\"elastic-10Q-Q2-2025.pdf\",\n", + ")\n", + "download_pdf(\n", + " f\"{base_url}/2025/q1/18656e06-8107-4423-8e2b-6f2945438053.pdf\",\n", + " filename=\"elastic-10Q-Q1-2025.pdf\",\n", + ")\n", + "download_pdf(\n", + " f\"{base_url}/2024/q4/9949f03b-09fb-4941-b105-62a304dc1411.pdf\",\n", + " filename=\"elastic-10Q-Q4-2024.pdf\",\n", + ")\n", + "download_pdf(\n", + " f\"{base_url}/2024/q3/7e60e3bd-ff50-4ae8-ab12-5b3ae19420e6.pdf\",\n", + " filename=\"elastic-10Q-Q3-2024.pdf\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Azure AI Document Intelligence Imports and Environment Variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from azure.core.credentials import AzureKeyCredential\n", + "from azure.ai.documentintelligence import DocumentIntelligenceClient\n", + "from azure.ai.documentintelligence.models import AnalyzeResult\n", + "from azure.ai.documentintelligence.models import AnalyzeDocumentRequest\n", + "import json\n", + "from dotenv import load_dotenv\n", + "from tqdm import tqdm\n", + "\n", + "load_dotenv()\n", + "\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv(\n", + " \"AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT\"\n", + ")\n", + "AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY = os.getenv(\n", + " 
 \"AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parse paragraphs using AnalyzeResult\n", + "\n", + "This function extracts the paragraph text via an AnalyzeResult on a PDF file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_paragraphs(analyze_result):\n", + " table_offsets = []\n", + " page_content = {}\n", + "\n", + " for paragraph in analyze_result.paragraphs:\n", + " for span in paragraph.spans:\n", + " if span.offset not in table_offsets:\n", + " for region in paragraph.bounding_regions:\n", + " page_number = region.page_number\n", + " if page_number not in page_content:\n", + " page_content[page_number] = []\n", + " page_content[page_number].append(\n", + " {\"content_text\": paragraph.content}\n", + " )\n", + " return page_content, table_offsets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parse tables using AnalyzeResult\n", + "\n", + "This function extracts the table content via an AnalyzeResult on a PDF file."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_tables(analyze_result, table_offsets):\n", + " page_content = {}\n", + "\n", + " for table in analyze_result.tables:\n", + " table_data = []\n", + " for region in table.bounding_regions:\n", + " page_number = region.page_number\n", + " for cell in table.cells:\n", + " for span in cell.spans:\n", + " table_offsets.append(span.offset)\n", + " table_data.append(\n", + " f\"Cell [{cell.row_index}, {cell.column_index}]: {cell.content}\"\n", + " )\n", + "\n", + " if page_number not in page_content:\n", + " page_content[page_number] = []\n", + "\n", + " page_content[page_number].append({\"content_text\": \"\\n\".join(table_data)})\n", + "\n", + " return page_content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Combine paragraph and table text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def combine_paragraphs_tables(filepath, paragraph_content, table_content):\n", + " page_content_concatenated = {}\n", + " structured_data = []\n", + "\n", + " # Combine paragraph and table content\n", + " for p_number in set(paragraph_content.keys()).union(table_content.keys()):\n", + " concatenated_text = \"\"\n", + "\n", + " if p_number in paragraph_content:\n", + " for content in paragraph_content[p_number]:\n", + " concatenated_text += content[\"content_text\"] + \"\\n\"\n", + "\n", + " if p_number in table_content:\n", + " for content in table_content[p_number]:\n", + " concatenated_text += content[\"content_text\"] + \"\\n\"\n", + "\n", + " page_content_concatenated[p_number] = concatenated_text.strip()\n", + "\n", + " # Append a single item per page to the structured_data list\n", + " for p_number, concatenated_text in page_content_concatenated.items():\n", + " structured_data.append(\n", + " {\n", + " \"page_number\": p_number,\n", + " \"content_text\": 
concatenated_text,\n", + " \"pdf_file\": os.path.basename(filepath),\n", + " }\n", + " )\n", + "\n", + " return structured_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bring it all together" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pdf_files = [\n", + " os.path.join(input_folder_pdf, file)\n", + " for file in os.listdir(input_folder_pdf)\n", + " if file.endswith(\".pdf\")\n", + "]\n", + "\n", + "document_intelligence_client = DocumentIntelligenceClient(\n", + " endpoint=AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT,\n", + " credential=AzureKeyCredential(AZURE_AI_DOCUMENT_INTELLIGENCE_API_KEY),\n", + " connection_timeout=600,\n", + ")\n", + "\n", + "for filepath in tqdm(pdf_files, desc=\"Parsing PDF files\"):\n", + " with open(filepath, \"rb\") as file:\n", + " poller = document_intelligence_client.begin_analyze_document(\n", + " \"prebuilt-layout\", AnalyzeDocumentRequest(bytes_source=file.read())\n", + " )\n", + "\n", + " analyze_result: AnalyzeResult = poller.result()\n", + "\n", + " paragraph_content, table_offsets = parse_paragraphs(analyze_result)\n", + " table_content = parse_tables(analyze_result, table_offsets)\n", + " structured_data = combine_paragraphs_tables(\n", + " filepath, paragraph_content, table_content\n", + " )\n", + "\n", + " # Convert the structured data to JSON format\n", + " json_output = json.dumps(structured_data, indent=4)\n", + "\n", + " # Get the filename without the \".pdf\" extension\n", + " filename_without_ext = os.path.splitext(os.path.basename(filepath))[0]\n", + " # Write the JSON output to a file\n", + " output_json_file = f\"{output_folder_pdf}/{filename_without_ext}.json\"\n", + "\n", + " with open(output_json_file, \"w\") as json_file:\n", + " json_file.write(json_output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set imports for the elasticsearch client and environment variables" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from dotenv import load_dotenv\n", + "from elasticsearch import Elasticsearch\n", + "from tqdm import tqdm\n", + "import os\n", + "\n", + "load_dotenv()\n", + "\n", + "ES_URL = os.getenv(\"ES_URL\")\n", + "ES_API_KEY = os.getenv(\"ES_API_KEY\")\n", + "\n", + "es = Elasticsearch(hosts=ES_URL, api_key=ES_API_KEY, request_timeout=300)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create index in Elastic Cloud Serverless" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index_name = \"pdf-chat\"\n", + "index_body = {\n", + " \"mappings\": {\n", + " \"properties\": {\n", + " \"page_content\": {\n", + " \"type\": \"text\",\n", + " \"copy_to\": [\"page_content_sparse\", \"page_content_dense\"],\n", + " },\n", + " \"page_content_sparse\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": \".elser-2-elasticsearch\",\n", + " },\n", + " \"page_content_dense\": {\n", + " \"type\": \"semantic_text\",\n", + " \"inference_id\": \".multilingual-e5-small-elasticsearch\",\n", + " },\n", + " \"page_number\": {\"type\": \"text\"},\n", + " \"pdf_file\": {\"type\": \"text\", \"fields\": {\"keyword\": {\"type\": \"keyword\"}}},\n", + " }\n", + " }\n", + "}\n", + "\n", + "if es.indices.exists(index=index_name):\n", + " es.indices.delete(index=index_name)\n", + " print(f\"Index '{index_name}' deleted successfully.\")\n", + "\n", + "response = es.indices.create(index=index_name, body=index_body)\n", + "if \"acknowledged\" in response and response[\"acknowledged\"]:\n", + " print(f\"Index '{index_name}' created successfully.\")\n", + "elif \"error\" in response:\n", + " print(f\"Failed to create: '{index_name}'\")\n", + " print(f\"Error: {response['error']['reason']}\")\n", + "else:\n", + " print(f\"Index '{index_name}' already exists.\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = os.listdir(output_folder_pdf)\n", + "with tqdm(total=len(files), desc=\"Indexing PDF docs\") as pbar_files:\n", + " for file in files:\n", + " with open(output_folder_pdf + \"/\" + file) as f:\n", + " data = json.loads(f.read())\n", + "\n", + " with tqdm(total=len(data), desc=f\"Processing {file}\") as pbar_pages:\n", + " for page in data:\n", + " doc = {\n", + " \"page_content\": page[\"content_text\"],\n", + " \"page_number\": page[\"page_number\"],\n", + " \"pdf_file\": page[\"pdf_file\"],\n", + " }\n", + " id = f\"{page['pdf_file']}_{page['page_number']}\"\n", + " es.index(index=index_name, id=id, body=json.dumps(doc))\n", + " pbar_pages.update(1)\n", + "\n", + " pbar_files.update(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prompt List\n", + "\n", + "1. Compare/contrast subscription revenue for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?\n", + "2. Provide an Income Taxes summary for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?\n", + "3. How has the balance sheet changed for Q2-2025, Q1-2025, Q4-2024 and Q3-2024?" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/supporting-blog-content/unifying-elastic-vector-database-and-llms-for-intelligent-query/Unifying_Elastic_Vector_Database_and_LLMs_for_Intelligent_Query.ipynb b/supporting-blog-content/unifying-elastic-vector-database-and-llms-for-intelligent-query/Unifying_Elastic_Vector_Database_and_LLMs_for_Intelligent_Query.ipynb index 3c09acdf..aa145329 100644 --- a/supporting-blog-content/unifying-elastic-vector-database-and-llms-for-intelligent-query/Unifying_Elastic_Vector_Database_and_LLMs_for_Intelligent_Query.ipynb +++ b/supporting-blog-content/unifying-elastic-vector-database-and-llms-for-intelligent-query/Unifying_Elastic_Vector_Database_and_LLMs_for_Intelligent_Query.ipynb @@ -539,7 +539,9 @@ " print(f\"Error deleting template '{template_id}': {e}\")\n", "\n", "\n", - "def create_search_template(template_id, template_content):\n", + "def create_search_template(\n", + " template_id=TEMPLATE_ID, template_content=search_template_content\n", + "):\n", " \"\"\"Creates a new search template\"\"\"\n", " try:\n", " es.put_script(id=template_id, body=template_content)\n", @@ -1007,7 +1009,9 @@ "print(\"Creating hotels index...\")\n", "create_index()\n", "print(\"Ingesting hotels data...\")\n", - "ingest_data()" + "ingest_data()\n", + "print(\"Creating Search Template...\")\n", + "create_search_template()" ] }, {