Better splitting

pamelafox · pamelafox · commit 7130a2473ec1 · 2024-11-22T11:46:28.000-08:00
diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py
@@ -171,7 +171,7 @@ def sourcepage_from_file_page(cls, filename, page=0) -> str:
 
     @classmethod
     def blob_image_name_from_file_page(cls, filename, page=0) -> str:
-        return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".png"
+        return os.path.splitext(os.path.basename(filename))[0] + f"-{page+1}" + ".png"
 
     @classmethod
     def blob_name_from_file_name(cls, filename) -> str:
diff --git a/app/backend/prepdocslib/cu_image.py b/app/backend/prepdocslib/cu_image.py
@@ -17,7 +17,7 @@
 PATH_ANALYZER_INFERENCE = "/analyzers/{analyzerId}:analyze"
 PATH_ANALYZER_INFERENCE_GET_IMAGE = "/analyzers/{analyzerId}/results/{operationId}/images/{imageId}"
 
-analyzer_name = "image_schema_analyzer"
+analyzer_name = "image_analyzer"
 image_schema = {
     "analyzerId": analyzer_name,
     "name": "Image understanding",
@@ -27,33 +27,11 @@
     "config": {"returnDetails": False},
     "fieldSchema": {
         "name": "ImageInformation",
-        "descriptions": "Structured information from images.",
+        "descriptions": "Description of image.",
         "fields": {
-            "Title": {
+            "Description": {
                 "type": "string",
-                "description": "Title for the image (either taken from the image directly or a good short title based off content)",
-            },
-            "ImageType": {
-                "type": "string",
-                "description": "The type of image.",
-                "kind": "classify",
-                "enum": [
-                    "chart",
-                    "diagram",
-                    "table",
-                    "figure",
-                    "photo",
-                    "screenshot",
-                    "logo",
-                    "icon",
-                    "map",
-                    "infographic",
-                    "other",
-                ],
-            },
-            "MarkdownDescription": {
-                "type": "string",
-                "description": "Description of the image in markdown format. Start with a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in tabular markdown format, with valid syntax and accurate numbers. If the image is a chart, describe any axis or legends.",
+                "description": "Description of the image. If the image has a title, start with the title. Include a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in an HTML table tag, with accurate numbers. If the image is a chart, describe any axis or legends. The only allowed HTML tags are the table/thead/tr/td/tbody tags.",
             },
         },
     },
@@ -133,4 +111,5 @@ async def poll():
 
                 results = await poll()
                 fields = results["result"]["contents"][0]["fields"]
-                return f"Title: {fields['Title']['valueString']}\n\nType: {fields['ImageType']['valueString']}\n\nDescription: {fields['MarkdownDescription']['valueString']}"
+                # The analyzer schema above declares a single "Description" field, so read that key
+                return fields["Description"]["valueString"]
diff --git a/app/backend/prepdocslib/figure_output.json b/app/backend/prepdocslib/figure_output.json
diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py
@@ -3,7 +3,7 @@ class Page:
     A single page from a document
 
     Attributes:
-        page_num (int): Page number
+        page_num (int): Page number (0-indexed)
         offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow")
         text (str): The text of the page
     """
@@ -17,6 +17,10 @@ def __init__(self, page_num: int, offset: int, text: str):
 class SplitPage:
     """
     A section of a page that has been split into a smaller chunk.
+
+    Attributes:
+        page_num (int): Page number (0-indexed)
+        text (str): The text of the section
     """
 
     def __init__(self, page_num: int, text: str):
diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py
@@ -1,5 +1,6 @@
 import html
 import io
+import json
 import logging
 from enum import Enum
 from typing import IO, AsyncGenerator, Union
@@ -8,6 +9,7 @@
 from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
 from azure.ai.documentintelligence.models import (
     AnalyzeDocumentRequest,
+    AnalyzeResult,
     DocumentFigure,
     DocumentTable,
 )
@@ -83,19 +85,20 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
                 poller = await document_intelligence_client.begin_analyze_document(
                     model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
                 )
-            form_recognizer_results = await poller.result()
+            form_recognizer_results: AnalyzeResult = await poller.result()
 
             offset = 0
-            for page_num, page in enumerate(form_recognizer_results.pages):
+            pages_json = []
+            for page in form_recognizer_results.pages:
                 tables_on_page = [
                     table
                     for table in (form_recognizer_results.tables or [])
-                    if table.bounding_regions and table.bounding_regions[0].page_number == page_num + 1
+                    if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number
                 ]
                 figures_on_page = [
                     figure
                     for figure in (form_recognizer_results.figures or [])
-                    if figure.bounding_regions and figure.bounding_regions[0].page_number == page_num + 1
+                    if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number
                 ]
 
                 class ObjectType(Enum):
@@ -135,13 +138,26 @@ class ObjectType(Enum):
                             added_objects.add(mask_char)
                     elif object_type == ObjectType.FIGURE:
                         if mask_char not in added_objects:
-                            page_text += await DocumentAnalysisParser.figure_to_html(
+                            figure_html = await DocumentAnalysisParser.figure_to_html(
                                 doc_for_pymupdf, cu_manager, figures_on_page[object_idx]
                             )
+                            page_text += figure_html
                             added_objects.add(mask_char)
-                # TODO: reset page numbers based on the mask
-                yield Page(page_num=page_num, offset=offset, text=page_text)
+                # We remove these comments since they are not needed and skew the page numbers
+                page_text = page_text.replace("<!-- PageBreak -->", "")
+                # We remove excess newlines at the beginning and end of the page
+                page_text = page_text.strip()
+                yield Page(page_num=page.page_number - 1, offset=offset, text=page_text)
+                # Record the page as a JSON-serializable dict so it can be inspected via debug logging
+                page_json = {
+                    "page_num": page.page_number - 1,
+                    "offset": offset,
+                    "text": page_text,
+                }
+                pages_json.append(page_json)
                 offset += len(page_text)
+            if logging.getLogger(__name__).isEnabledFor(logging.DEBUG):
+                logging.getLogger(__name__).debug("Parsed pages: %s", json.dumps(pages_json))
 
     @staticmethod
     async def figure_to_html(
@@ -158,12 +174,12 @@ async def figure_to_html(
         page_number = figure.bounding_regions[0]["pageNumber"]  # 1-indexed
         cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
         figure_description = await cu_manager.verbalize_figure(cropped_img)
-        # TODO: add DI's original figcaption to this caption - figure.caption.content
-        return f"<figure><figcaption>{figure_description}</figcaption></figure>"
+        figure_title = (figure.caption and figure.caption.content) or ""
+        return f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
 
     @staticmethod
     def table_to_html(table: DocumentTable):
-        table_html = "<table>"
+        table_html = "<figure><table>"
         rows = [
             sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index)
             for i in range(table.row_count)
@@ -179,7 +195,7 @@ def table_to_html(table: DocumentTable):
                     cell_spans += f" rowSpan={cell.row_span}"
                 table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}</{tag}>"
             table_html += "</tr>"
-        table_html += "</table>"
+        table_html += "</table></figure>"
         return table_html
 
     @staticmethod
diff --git a/app/backend/prepdocslib/textsplitter.py b/app/backend/prepdocslib/textsplitter.py
@@ -103,6 +103,7 @@ def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitP
         tokens = bpe.encode(text)
         if len(tokens) <= self.max_tokens_per_section:
             # Section is already within max tokens, return
+            logger.debug("Page %s: %s", page_num, text)
             yield SplitPage(page_num=page_num, text=text)
         else:
             # Start from the center and try and find the closest sentence ending by spiralling outward.
@@ -192,24 +193,15 @@ def find_page(offset):
             section_text = all_text[start:end]
             yield from self.split_page_by_max_tokens(page_num=find_page(start), text=section_text)
 
-            last_table_start = section_text.rfind("<table")
             last_figure_start = section_text.rfind("<figure")
-            if last_table_start > 2 * self.sentence_search_limit and last_table_start > section_text.rfind("</table"):
-                # If the section ends with an unclosed table, we need to start the next section with the table.
-                # If table starts inside sentence_search_limit, we ignore it, as that will cause an infinite loop for tables longer than MAX_SECTION_LENGTH
-                # If last table starts inside section_overlap, keep overlapping
-                logger.info(
-                    f"Section ends with unclosed table, starting next section with the table at page {find_page(start)} offset {start} table start {last_table_start}"
-                )
-                start = min(end - self.section_overlap, start + last_table_start)
-            elif last_figure_start > 2 * self.sentence_search_limit and last_figure_start > section_text.rfind(
+            if last_figure_start > 2 * self.sentence_search_limit and last_figure_start > section_text.rfind(
                 "</figure"
             ):
                 # If the section ends with an unclosed figure, we need to start the next section with the figure.
+                start = min(end - self.section_overlap, start + last_figure_start)
                 logger.info(
                     f"Section ends with unclosed figure, starting next section with the figure at page {find_page(start)} offset {start} figure start {last_figure_start}"
                 )
-                start = min(end - self.section_overlap, start + last_figure_start)
             else:
                 start = end - self.section_overlap
 
diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md
@@ -69,7 +69,7 @@ A [recent change](https://github.com/Azure-Samples/azure-search-openai-demo/pull
 
 You may want to remove documents from the index. For example, if you're using the sample data, you may want to remove the documents that are already in the index before adding your own.
 
-To remove all documents, use `scripts/prepdocs.sh --removeall` or `scripts/prepdocs.ps1 --removeall`.
+To remove all documents, use `./scripts/prepdocs.sh --removeall` or `./scripts/prepdocs.ps1 --removeall`.
 
 You can also remove individual documents by using the `--remove` flag. Open either `scripts/prepdocs.sh` or `scripts/prepdocs.ps1` and replace `/data/*` with `/data/YOUR-DOCUMENT-FILENAME-GOES-HERE.pdf`. Then run `scripts/prepdocs.sh --remove` or `scripts/prepdocs.ps1 --remove`.
 
diff --git a/pages.json b/pages.json
diff --git a/scripts/prepdocs.sh b/scripts/prepdocs.sh