Layout Merger Fixes & Text Chunker Optimisation (#161)

BenConstable9 · web-flow · commit 2c6a3db4ef9d · 2025-02-06T16:33:54.000Z
diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py
@@ -301,7 +301,7 @@ def get_semantic_chunker_skill(
 
         semantic_text_chunker_skill_inputs = [
             InputFieldMappingEntry(
-                name="content", source="/document/layout/merged_content"
+                name="content", source="/document/layout_merged_content"
             )
         ]
 
@@ -486,7 +486,6 @@ def get_layout_and_figure_merger_skill(self, chunk_by_page=False) -> WebApiSkill
             batch_size = 1
             degree_of_parallelism = 8
 
-        output = [OutputFieldMappingEntry(name="content", target_name="merged_content")]
         if chunk_by_page:
             merger_context = "/document/page_wise_layout/*"
             inputs = [
@@ -498,15 +497,23 @@ def get_layout_and_figure_merger_skill(self, chunk_by_page=False) -> WebApiSkill
                     source="/document/page_wise_layout/*/figures/*/updated_figure",
                 ),
             ]
+            output = [
+                OutputFieldMappingEntry(name="content", target_name="merged_content")
+            ]
         else:
-            merger_context = "/document/layout"
+            merger_context = "/document"
 
             inputs = [
                 InputFieldMappingEntry(name="layout", source="/document/layout"),
                 InputFieldMappingEntry(
                     name="figures", source="/document/layout/figures/*/updated_figure"
                 ),
             ]
+            output = [
+                OutputFieldMappingEntry(
+                    name="content", target_name="layout_merged_content"
+                )
+            ]
 
         figure_analysis_skill = WebApiSkill(
             name="Layout and Figure Merger Skill",
diff --git a/image_processing/src/image_processing/layout_and_figure_merger.py b/image_processing/src/image_processing/layout_and_figure_merger.py
@@ -4,6 +4,7 @@
 import logging
 import re
 from layout_holders import FigureHolder, LayoutHolder
+from typing import List
 
 
 class LayoutAndFigureMerger:
@@ -18,37 +19,48 @@ def insert_figure_description(
             figure_holder (FigureHolder): The figure to be updated.
 
         Returns:
-            str: The updated Markdown content with the new figure description.
+            tuple[LayoutHolder, int]: The updated layout holder and the change in length of the Markdown content after updating the figure description.
         """
-
         # Calculate the end index of the content to be replaced
         end_index = figure_holder.offset + figure_holder.length
 
-        # Ensure that the end_index does not exceed the length of the Markdown content
+        # Ensure the offset is valid
+        if figure_holder.offset < 0 or figure_holder.offset > len(
+            layout_holder.content
+        ):
+            logging.error("Figure offset is out of bounds.")
+            raise ValueError("Figure offset is out of bounds.")
+
+        # Ensure the end index does not exceed the length of the Markdown content
         if end_index > len(layout_holder.content):
             logging.info(
-                "End index exceeds the length of the content. Adjusting the end index to the length of the content."
+                "End index exceeds the length of the content. Adjusting to the length of the content."
             )
             end_index = len(layout_holder.content)
 
+        logging.info(f"Figure Markdown Content: {figure_holder.markdown}")
+
         # Replace the old string with the new string
         layout_holder.content = (
             layout_holder.content[: figure_holder.offset]
             + figure_holder.markdown
             + layout_holder.content[end_index:]
         )
 
-        return len(figure_holder.markdown) - figure_holder.length
+        inserted_length = len(figure_holder.markdown) - figure_holder.length
+        logging.info(f"Inserted Length: {inserted_length}")
+
+        return layout_holder, inserted_length
 
     async def merge_figures_into_layout(
-        self, layout: LayoutHolder, figures: list[FigureHolder]
+        self, layout_holder: LayoutHolder, figures: List[FigureHolder]
     ) -> LayoutHolder:
         """
         Merges the figures into the layout.
 
         Args:
-            layout (LayoutHolder): The layout text.
-            figures (list): The list of figures.
+            layout_holder (LayoutHolder): The layout text.
+            figures (List[FigureHolder]): The list of figures.
 
         Returns:
             LayoutHolder: The updated layout text with the figures.
@@ -59,30 +71,51 @@ async def merge_figures_into_layout(
         # Iterate over the figures
         for figure in figures:
             logging.info(f"Inserting Figure: {figure.figure_id}")
+            logging.info(f"Figure Description: {figure.description}")
             # Update the figure description in the layout
             figure.offset += running_offset
-            length = self.insert_figure_description(layout, figure)
+            layout_holder, inserted_length = self.insert_figure_description(
+                layout_holder, figure
+            )
 
             # Update the offset
-            running_offset += length
+            running_offset += inserted_length
+
+        logging.info("Merged figures into layout.")
+        logging.info("Updated Layout with Figures: %s", layout_holder.content)
+        # Precompile regex patterns
+        irrelevant_figure_pattern = re.compile(
+            r"<figure[^>]*>\s*(Irrelevant Image|\'Irrelevant Image\')\s*</figure>",
+            re.DOTALL,
+        )
+        empty_or_whitespace_figure_pattern = re.compile(
+            r"<figure[^>]*>\s*</figure>", re.DOTALL
+        )
+        html_comments_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
 
         # Remove irrelevant figures
-        irrelevant_figure_pattern = r"<figure[^>]*>.*?Irrelevant Image.*?</figure>"
-        layout.content = re.sub(
-            irrelevant_figure_pattern, "", layout.content, flags=re.DOTALL
+        layout_holder.content = irrelevant_figure_pattern.sub("", layout_holder.content)
+        logging.info("Removed irrelevant figures from layout.")
+        logging.info(
+            "Updated Layout without Irrelevant Figures: %s", layout_holder.content
         )
 
-        empty_or_whitespace_figure_pattern = r"<figure[^>]*>\s*</figure>"
-        layout.content = re.sub(
-            empty_or_whitespace_figure_pattern, "", layout.content, flags=re.DOTALL
+        # Remove empty or whitespace figures
+        layout_holder.content = empty_or_whitespace_figure_pattern.sub(
+            "", layout_holder.content
         )
-
-        html_comments_pattern = r"<!--.*?-->"
-        layout.content = re.sub(
-            html_comments_pattern, "", layout.content, flags=re.DOTALL
+        logging.info("Removed empty or whitespace figures from layout.")
+        logging.info(
+            "Updated Layout without Empty or Whitespace Figures: %s",
+            layout_holder.content,
         )
 
-        return layout
+        # Remove HTML comments
+        layout_holder.content = html_comments_pattern.sub("", layout_holder.content)
+        logging.info("Removed HTML comments from layout.")
+        logging.info("Updated Layout without HTML Comments: %s", layout_holder.content)
+
+        return layout_holder
 
     async def merge(self, record: dict) -> dict:
         """
@@ -94,19 +127,21 @@ async def merge(self, record: dict) -> dict:
         Returns:
         - record (dict): The record containing the image, its caption, and the generated description.
         """
-        layout = LayoutHolder(**record["data"]["layout"])
+        layout_holder = LayoutHolder(**record["data"]["layout"])
 
         figures = [FigureHolder(**figure) for figure in record["data"]["figures"]]
 
         try:
-            logging.info(f"Input Data: {layout}")
-            updated_layout = await self.merge_figures_into_layout(layout, figures)
-            logging.info(f"Updated Data: {updated_layout}")
+            logging.info(f"Input Data: {layout_holder}")
+            updated_layout = await self.merge_figures_into_layout(
+                layout_holder, figures
+            )
+            logging.info(f"Updated Layout Data: {updated_layout}")
         except Exception as e:
             logging.error(f"Failed to merge figures into layout. Error: {e}")
             return {
                 "recordId": record["recordId"],
-                "data": {},
+                "data": None,
                 "errors": [
                     {
                         "message": "Failed to merge figures into layout.",
diff --git a/image_processing/src/image_processing/requirements.txt b/image_processing/src/image_processing/requirements.txt
@@ -1,7 +1,7 @@
 # This file was autogenerated by uv via the following command:
 #    uv export --frozen --no-hashes --no-editable --no-sources --no-group dev --directory image_processing -o src/image_processing/requirements.txt
 aiohappyeyeballs==2.4.4
-aiohttp==3.11.11
+aiohttp==3.11.12
 aiosignal==1.3.2
 annotated-types==0.7.0
 anyio==4.8.0
@@ -16,7 +16,7 @@ azure-identity==1.19.0
 azure-search==1.0.0b2
 azure-search-documents==11.6.0b8
 azure-storage-blob==12.24.1
-beautifulsoup4==4.12.3
+beautifulsoup4==4.13.3
 blis==0.7.11
 bs4==0.0.2
 catalogue==2.0.10
@@ -34,7 +34,7 @@ en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_
 et-xmlfile==2.0.0
 filelock==3.17.0
 frozenlist==1.5.0
-fsspec==2024.12.0
+fsspec==2025.2.0
 h11==0.14.0
 httpcore==1.0.7
 httpx==0.28.1
@@ -50,15 +50,15 @@ marisa-trie==1.2.1
 markdown-it-py==3.0.0
 markupsafe==3.0.2
 mdurl==0.1.2
-model2vec==0.3.8
+model2vec==0.3.9
 msal==1.31.1
 msal-extensions==1.2.0
 msrest==0.7.1
 multidict==6.1.0
 murmurhash==1.0.12
 numpy==1.26.4
 oauthlib==3.2.2
-openai==1.60.2
+openai==1.61.1
 openpyxl==3.1.5
 packaging==24.2
 pandas==2.2.3
@@ -71,7 +71,7 @@ pydantic==2.10.6
 pydantic-core==2.27.2
 pygments==2.19.1
 pyjwt==2.10.1
-pymupdf==1.25.2
+pymupdf==1.25.3
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2025.1
diff --git a/image_processing/src/image_processing/semantic_text_chunker.py b/image_processing/src/image_processing/semantic_text_chunker.py
@@ -59,6 +59,22 @@ def num_tokens_from_string(self, string: str) -> int:
 
         return len(encoding.encode(string))
 
+    def clean_chunks_and_map(self, chunks, is_table_or_figure_map):
+        cleaned_chunks = []
+        cleaned_is_table_or_figure_map = []
+
+        for current_chunk, is_table_or_figure in zip(chunks, is_table_or_figure_map):
+            cleaned_chunk = current_chunk.strip()
+            if len(cleaned_chunk) > 0:
+                # Restore the blank lines around a Markdown heading, since strip() removed them
+                if self.is_markdown_heading(current_chunk):
+                    cleaned_chunk = "\n\n" + cleaned_chunk + "\n\n"
+
+                cleaned_chunks.append(cleaned_chunk)
+                cleaned_is_table_or_figure_map.append(is_table_or_figure)
+
+        return cleaned_chunks, cleaned_is_table_or_figure_map
+
     async def chunk(self, text: str) -> list[dict]:
         """Attempts to chunk the text by:
             Splitting into sentences
@@ -86,6 +102,10 @@ async def chunk(self, text: str) -> list[dict]:
             grouped_sentences, is_table_or_figure_map
         )
 
+        forward_pass_chunks, new_is_table_or_figure_map = self.clean_chunks_and_map(
+            forward_pass_chunks, new_is_table_or_figure_map
+        )
+
         logging.info(
             f"""Number of Forward pass chunks: {
                 len(forward_pass_chunks)}"""
@@ -129,7 +149,7 @@ def filter_empty_figures(self, text):
 
     def clean_new_lines(self, text):
         # Remove single newlines surrounded by < and >
-        cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text)
+        cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text.strip())
 
         # Replace all other single newlines with space
         cleaned_text = re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned_text)
@@ -190,7 +210,7 @@ def split_into_sentences(self, text: str) -> list[str]:
                         self.is_markdown_heading(part)
                         and part.endswith("\n\n") is False
                     ):
-                        part = part + "\n\n"
+                        part = "\n\n" + part + "\n\n"
 
                     heading_split_sentences.append(part)
 
@@ -300,23 +320,36 @@ def retrive_current_chunk_at_n(n):
             else:
                 return current_chunk[n]
 
-        current_chunk_tokens = self.num_tokens_from_string(" ".join(current_chunk))
+        def get_current_chunk_tokens(chunk_segments):
+            return self.num_tokens_from_string(" ".join(chunk_segments))
+
+        current_chunk_tokens = get_current_chunk_tokens(current_chunk)
 
         if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens:
-            logging.info("Comparing chunks")
-            cosine_sim = self.sentence_similarity(
-                retrieve_current_chunks_from_n(-2), current_sentence
-            )
+            # Work out the two chunks a split would produce so their token counts can be checked
+            if len(current_chunk) > 2:
+                would_be_new_chunk = retrieve_current_chunk_up_to_n(1)
+                would_be_current_chunk = [retrive_current_chunk_at_n(-1)]
+            else:
+                would_be_new_chunk = retrive_current_chunk_at_n(0)
+                would_be_current_chunk = [retrive_current_chunk_at_n(1)]
+
             if (
-                cosine_sim < self.similarity_threshold
-                or current_chunk_tokens >= self.max_chunk_tokens
+                get_current_chunk_tokens(would_be_new_chunk) >= self.min_chunk_tokens
+                and get_current_chunk_tokens(would_be_current_chunk)
+                >= self.min_chunk_tokens
             ):
-                if len(current_chunk) > 2:
-                    new_chunk = retrieve_current_chunk_up_to_n(1)
-                    current_chunk = [retrive_current_chunk_at_n(-1)]
-                else:
-                    new_chunk = retrive_current_chunk_at_n(0)
-                    current_chunk = [retrive_current_chunk_at_n(1)]
+                logging.info("Comparing chunks")
+                if (
+                    current_chunk_tokens >= self.max_chunk_tokens
+                    or self.sentence_similarity(
+                        retrieve_current_chunks_from_n(-2), current_sentence
+                    )
+                    < self.similarity_threshold
+                ):
+                    return would_be_new_chunk, would_be_current_chunk
+            else:
+                logging.info("Chunk too small to compare")
         else:
             logging.info("Chunk too small to compare")
 
diff --git a/uv.lock b/uv.lock