77import spacy
88import numpy as np
99
# Configure root logging once at import time so the chunker's
# module-level logging.info/debug calls are actually emitted.
logging.basicConfig(level=logging.INFO)

1113class SemanticTextChunker :
1214 def __init__ (
1315 self ,
1416 num_surrounding_sentences : int = 1 ,
1517 similarity_threshold : float = 0.8 ,
1618 max_chunk_tokens : int = 200 ,
19+ min_chunk_tokens : int = 50 ,
1720 ):
1821 self .num_surrounding_sentences = num_surrounding_sentences
1922 self .similarity_threshold = similarity_threshold
2023 self .max_chunk_tokens = max_chunk_tokens
24+ self .min_chunk_tokens = min_chunk_tokens
2125 try :
2226 self ._nlp_model = spacy .load ("en_core_web_md" )
2327 except IOError as e :
2428 raise ValueError ("Spacy model 'en_core_web_md' not found." ) from e
2529
30+ def sentence_contains_figure_or_table_ending (self , text : str ):
31+ return "</figure>" in text or "</table>" in text
32+
2633 def sentence_contains_figure_or_table (self , text : str ):
27- return ("<figure" in text or "</figure>" in text ) or (
28- "<table>" in text or "</table>" in text
34+ return (
35+ ("<figure" in text or "</figure>" in text )
36+ or ("<table>" in text or "</table>" in text )
37+ or ("<th" in text or "th>" in text )
38+ or ("<td" in text or "td>" in text )
2939 )
3040
3141 def sentence_is_complete_figure_or_table (self , text : str ):
@@ -59,6 +69,7 @@ async def chunk(self, text: str) -> list[dict]:
5969 list(str): The list of chunks"""
6070
6171 sentences = self .split_into_sentences (text )
72+
6273 (
6374 grouped_sentences ,
6475 is_table_or_figure_map ,
@@ -68,15 +79,54 @@ async def chunk(self, text: str) -> list[dict]:
6879 grouped_sentences , is_table_or_figure_map
6980 )
7081
82+ logging .info (
83+ f"""Number of Forward pass chunks: {
84+ len (forward_pass_chunks )} """
85+ )
86+ logging .info (f"Forward pass chunks: { forward_pass_chunks } " )
87+
7188 backwards_pass_chunks , _ = self .merge_chunks (
7289 forward_pass_chunks , new_is_table_or_figure_map , forwards_direction = False
7390 )
7491
75- backwards_pass_chunks = list (
76- map (lambda x : x .strip (), reversed (backwards_pass_chunks ))
92+ reversed_backwards_pass_chunks = list (reversed (backwards_pass_chunks ))
93+
94+ logging .info (
95+ f"""Number of Backaward pass chunks: {
96+ len (reversed_backwards_pass_chunks )} """
7797 )
98+ logging .info (f"Backward pass chunks: { reversed_backwards_pass_chunks } " )
99+
100+ cleaned_final_chunks = []
101+ for chunk in reversed_backwards_pass_chunks :
102+ stripped_chunk = chunk .strip ()
103+ if len (stripped_chunk ) > 0 :
104+ cleaned_final_chunks .append (stripped_chunk )
105+
106+ logging .info (f"Number of final chunks: { len (cleaned_final_chunks )} " )
107+ logging .info (f"Chunks: { cleaned_final_chunks } " )
108+
109+ return cleaned_final_chunks
110+
111+ def filter_empty_figures (self , text ):
112+ # Regular expression to match <figure>...</figure> with only newlines or spaces in between
113+ pattern = r"<figure>\s*</figure>"
78114
79- return list (reversed (backwards_pass_chunks ))
115+ # Replace any matches of the pattern with an empty string
116+ filtered_text = re .sub (pattern , "" , text )
117+
118+ return filtered_text
119+
120+ def clean_new_lines (self , text ):
121+ # Remove single newlines surrounded by < and >
122+ cleaned_text = re .sub (r"(?<=>)(\n)(?=<)" , "" , text )
123+
124+ # Replace all other single newlines with space
125+ cleaned_text = re .sub (r"(?<!\n)\n(?!\n)" , " " , cleaned_text )
126+
127+ # Replace multiple consecutive newlines with a single space followed by \n\n
128+ cleaned_text = re .sub (r"\n{2,}" , " \n \n " , cleaned_text )
129+ return cleaned_text
80130
81131 def split_into_sentences (self , text : str ) -> list [str ]:
82132 """Splits a set of text into a list of sentences uses the Spacy NLP model.
@@ -88,22 +138,45 @@ def split_into_sentences(self, text: str) -> list[str]:
88138 list(str): The extracted sentences
89139 """
90140
91- def replace_newlines_outside_html (text ):
92- def replacement (match ):
93- # Only replace if \n is outside HTML tags
94- if "<" not in match .group (0 ) and ">" not in match .group (0 ):
95- return match .group (0 ).replace ("\n " , " " )
96- return match .group (0 )
141+ cleaned_text = self .clean_new_lines (text )
142+
143+ # Filter out empty <figure>...</figure> tags
144+ cleaned_text = self .filter_empty_figures (cleaned_text )
145+
146+ doc = self ._nlp_model (cleaned_text )
147+
148+ tag_split_sentences = []
149+ # Pattern to match the closing and opening tag junctions with whitespace in between
150+ split_pattern = r"(</table>\s*<table\b[^>]*>|</figure>\s*<figure\b[^>]*>)"
151+ for sent in doc .sents :
152+ split_result = re .split (split_pattern , sent .text )
153+ for part in split_result :
154+ # Match the junction and split it into two parts
155+ if re .match (split_pattern , part ):
156+ # Split at the first whitespace
157+ tag_split = part .split (" " , 1 )
158+ # Add the closing tag (e.g., </table>)
159+ tag_split_sentences .append (tag_split [0 ])
160+ if len (tag_split ) > 1 :
161+ # Add the rest of the string with leading space
162+ tag_split_sentences .append (" " + tag_split [1 ])
163+ else :
164+ tag_split_sentences .append (part )
97165
98- # Match sequences of non-whitespace characters with \n outside tags
99- return re . sub ( r"[^<>\s]+\n[^<>\s]+" , replacement , text )
166+ # Now apply a split pattern against markdown headings
167+ heading_split_sentences = []
100168
101- doc = self ._nlp_model (replace_newlines_outside_html (text ))
102- sentences = [sent .text for sent in doc .sents ]
169+ # Iterate through each sentence in tag_split_sentences
170+ for sent in tag_split_sentences :
171+ # Use re.split to split on \n\n and headings, but keep \n\n in the result
172+ split_result = re .split (r"(\n\n|#+ .*)" , sent )
103173
104- print (len (sentences ))
174+ # Extend the result with the correctly split parts, retaining \n\n before the heading
175+ for part in split_result :
176+ if part .strip (): # Only add non-empty parts
177+ heading_split_sentences .append (part )
105178
106- return sentences
179+ return heading_split_sentences
107180
108181 def group_figures_and_tables_into_sentences (self , sentences : list [str ]):
109182 grouped_sentences = []
@@ -125,7 +198,7 @@ def group_figures_and_tables_into_sentences(self, sentences: list[str]):
125198 is_table_or_figure_map .append (False )
126199 else :
127200 # check for ending case
128- if self .sentence_contains_figure_or_table (current_sentence ):
201+ if self .sentence_contains_figure_or_table_ending (current_sentence ):
129202 holding_sentences .append (current_sentence )
130203
131204 full_sentence = " " .join (holding_sentences )
@@ -137,6 +210,8 @@ def group_figures_and_tables_into_sentences(self, sentences: list[str]):
137210 else :
138211 holding_sentences .append (current_sentence )
139212
213+ assert len (holding_sentences ) == 0 , "Holding sentences should be empty"
214+
140215 return grouped_sentences , is_table_or_figure_map
141216
142217 def look_ahead_and_behind_sentences (
@@ -183,31 +258,50 @@ def look_ahead_and_behind_sentences(
183258
184259 def merge_similar_chunks (self , current_sentence , current_chunk , forwards_direction ):
185260 new_chunk = None
186- # Only compare when we have 2 or more chunks
187261
188- if forwards_direction is False :
189- directional_current_chunk = list (reversed (current_chunk ))
190- else :
191- directional_current_chunk = current_chunk
262+ def retrieve_current_chunk_up_to_n (n ):
263+ if forwards_direction :
264+ return " " .join (current_chunk [:- n ])
265+ else :
266+ return " " .join (reversed (current_chunk [:- n ]))
192267
193- if len (current_chunk ) >= 2 :
268+ def retrieve_current_chunks_from_n (n ):
269+ if forwards_direction :
270+ return " " .join (current_chunk [n :])
271+ else :
272+ return " " .join (reversed (current_chunk [:- n ]))
273+
274+ def retrive_current_chunk_at_n (n ):
275+ if forwards_direction :
276+ return current_chunk [n ]
277+ else :
278+ return current_chunk [n ]
279+
280+ current_chunk_tokens = self .num_tokens_from_string (" " .join (current_chunk ))
281+
282+ if len (current_chunk ) >= 2 and current_chunk_tokens >= self .min_chunk_tokens :
283+ logging .debug ("Comparing chunks" )
194284 cosine_sim = self .sentence_similarity (
195- " " . join ( directional_current_chunk [ - 2 :] ), current_sentence
285+ retrieve_current_chunks_from_n ( - 2 ), current_sentence
196286 )
197287 if (
198288 cosine_sim < self .similarity_threshold
199- or self .num_tokens_from_string (" " .join (directional_current_chunk ))
200- >= self .max_chunk_tokens
289+ or current_chunk_tokens >= self .max_chunk_tokens
201290 ):
202291 if len (current_chunk ) > 2 :
203- new_chunk = " " . join ( directional_current_chunk [: 1 ] )
204- current_chunk = [directional_current_chunk [ - 1 ] ]
292+ new_chunk = retrieve_current_chunk_up_to_n ( 1 )
293+ current_chunk = [retrive_current_chunk_at_n ( - 1 ) ]
205294 else :
206- new_chunk = current_chunk [0 ]
207- current_chunk = [current_chunk [1 ]]
295+ new_chunk = retrive_current_chunk_at_n (0 )
296+ current_chunk = [retrive_current_chunk_at_n (1 )]
297+ else :
298+ logging .debug ("Chunk too small to compare" )
208299
209300 return new_chunk , current_chunk
210301
302+ def is_markdown_heading (self , text ):
303+ return text .strip ().startswith ("#" )
304+
211305 def merge_chunks (self , sentences , is_table_or_figure_map , forwards_direction = True ):
212306 chunks = []
213307 current_chunk = []
@@ -230,6 +324,10 @@ def retrieve_current_chunk():
230324
231325 current_sentence = sentences [current_sentence_index ]
232326
327+ if len (current_sentence .strip ()) == 0 :
328+ index += 1
329+ continue
330+
233331 # Detect if table or figure
234332 if is_table_or_figure_map [current_sentence_index ]:
235333 if forwards_direction :
@@ -244,7 +342,7 @@ def retrieve_current_chunk():
244342 # On the backwards pass we don't want to add to the table chunk
245343 chunks .append (retrieve_current_chunk ())
246344 new_is_table_or_figure_map .append (True )
247- current_chunk . append ( current_sentence )
345+ current_chunk = [ current_sentence ]
248346
249347 index += 1
250348 continue
@@ -260,11 +358,18 @@ def retrieve_current_chunk():
260358 )
261359
262360 if is_table_or_figure_behind :
263- # Finish off
264- current_chunk .append (current_sentence )
265- chunks .append (retrieve_current_chunk ())
266- new_is_table_or_figure_map .append (False )
267- current_chunk = []
361+ # Check if Makrdown heading
362+ if self .is_markdown_heading (current_sentence ):
363+ # Start new chunk
364+ chunks .append (retrieve_current_chunk ())
365+ new_is_table_or_figure_map .append (False )
366+ current_chunk = [current_sentence ]
367+ else :
368+ # Finish off
369+ current_chunk .append (current_sentence )
370+ chunks .append (retrieve_current_chunk ())
371+ new_is_table_or_figure_map .append (False )
372+ current_chunk = []
268373
269374 index += 1
270375 continue
@@ -307,7 +412,7 @@ def retrieve_current_chunk():
307412 index += 1
308413
309414 if len (current_chunk ) > 0 :
310- final_chunk = " " . join ( current_chunk )
415+ final_chunk = retrieve_current_chunk ( )
311416 chunks .append (final_chunk )
312417
313418 new_is_table_or_figure_map .append (
@@ -322,7 +427,13 @@ def sentence_similarity(self, text_1, text_2):
322427
323428 dot_product = np .dot (vec1 , vec2 )
324429 magnitude = np .linalg .norm (vec1 ) * np .linalg .norm (vec2 )
325- return dot_product / magnitude if magnitude != 0 else 0.0
430+ similarity = dot_product / magnitude if magnitude != 0 else 0.0
431+
432+ logging .debug (
433+ f"""Similarity between '{ text_1 } ' and '{
434+ text_2 } ': { similarity } """
435+ )
436+ return similarity
326437
327438
328439async def process_semantic_text_chunker (record : dict , text_chunker ) -> dict :
0 commit comments