Skip to content

Commit 2c6a3db

Browse files
Layout Merger Fixes & Text Chunker Optimisation (#161)
1 parent bac5566 commit 2c6a3db

File tree

5 files changed

+225
-145
lines changed

5 files changed

+225
-145
lines changed

deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ def get_semantic_chunker_skill(
301301

302302
semantic_text_chunker_skill_inputs = [
303303
InputFieldMappingEntry(
304-
name="content", source="/document/layout/merged_content"
304+
name="content", source="/document/layout_merged_content"
305305
)
306306
]
307307

@@ -486,7 +486,6 @@ def get_layout_and_figure_merger_skill(self, chunk_by_page=False) -> WebApiSkill
486486
batch_size = 1
487487
degree_of_parallelism = 8
488488

489-
output = [OutputFieldMappingEntry(name="content", target_name="merged_content")]
490489
if chunk_by_page:
491490
merger_context = "/document/page_wise_layout/*"
492491
inputs = [
@@ -498,15 +497,23 @@ def get_layout_and_figure_merger_skill(self, chunk_by_page=False) -> WebApiSkill
498497
source="/document/page_wise_layout/*/figures/*/updated_figure",
499498
),
500499
]
500+
output = [
501+
OutputFieldMappingEntry(name="content", target_name="merged_content")
502+
]
501503
else:
502-
merger_context = "/document/layout"
504+
merger_context = "/document"
503505

504506
inputs = [
505507
InputFieldMappingEntry(name="layout", source="/document/layout"),
506508
InputFieldMappingEntry(
507509
name="figures", source="/document/layout/figures/*/updated_figure"
508510
),
509511
]
512+
output = [
513+
OutputFieldMappingEntry(
514+
name="content", target_name="layout_merged_content"
515+
)
516+
]
510517

511518
figure_analysis_skill = WebApiSkill(
512519
name="Layout and Figure Merger Skill",

image_processing/src/image_processing/layout_and_figure_merger.py

Lines changed: 61 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55
import re
66
from layout_holders import FigureHolder, LayoutHolder
7+
from typing import List
78

89

910
class LayoutAndFigureMerger:
@@ -18,37 +19,48 @@ def insert_figure_description(
1819
figure_holder (FigureHolder): The figure to be updated.
1920
2021
Returns:
21-
str: The updated Markdown content with the new figure description.
22+
tuple[LayoutHolder, int]: The updated layout holder and the change in length of the Markdown content after updating the figure description.
2223
"""
23-
2424
# Calculate the end index of the content to be replaced
2525
end_index = figure_holder.offset + figure_holder.length
2626

27-
# Ensure that the end_index does not exceed the length of the Markdown content
27+
# Ensure the offset is valid
28+
if figure_holder.offset < 0 or figure_holder.offset > len(
29+
layout_holder.content
30+
):
31+
logging.error("Figure offset is out of bounds.")
32+
raise ValueError("Figure offset is out of bounds.")
33+
34+
# Ensure the end index does not exceed the length of the Markdown content
2835
if end_index > len(layout_holder.content):
2936
logging.info(
30-
"End index exceeds the length of the content. Adjusting the end index to the length of the content."
37+
"End index exceeds the length of the content. Adjusting to the length of the content."
3138
)
3239
end_index = len(layout_holder.content)
3340

41+
logging.info(f"Figure Markdown Content: {figure_holder.markdown}")
42+
3443
# Replace the old string with the new string
3544
layout_holder.content = (
3645
layout_holder.content[: figure_holder.offset]
3746
+ figure_holder.markdown
3847
+ layout_holder.content[end_index:]
3948
)
4049

41-
return len(figure_holder.markdown) - figure_holder.length
50+
inserted_length = len(figure_holder.markdown) - figure_holder.length
51+
logging.info(f"Inserted Length: {inserted_length}")
52+
53+
return layout_holder, inserted_length
4254

4355
async def merge_figures_into_layout(
44-
self, layout: LayoutHolder, figures: list[FigureHolder]
56+
self, layout_holder: LayoutHolder, figures: List[FigureHolder]
4557
) -> LayoutHolder:
4658
"""
4759
Merges the figures into the layout.
4860
4961
Args:
50-
layout (LayoutHolder): The layout text.
51-
figures (list): The list of figures.
62+
layout_holder (LayoutHolder): The layout text.
63+
figures (List[FigureHolder]): The list of figures.
5264
5365
Returns:
5466
LayoutHolder: The updated layout text with the figures.
@@ -59,30 +71,51 @@ async def merge_figures_into_layout(
5971
# Iterate over the figures
6072
for figure in figures:
6173
logging.info(f"Inserting Figure: {figure.figure_id}")
74+
logging.info(f"Figure Description: {figure.description}")
6275
# Update the figure description in the layout
6376
figure.offset += running_offset
64-
length = self.insert_figure_description(layout, figure)
77+
layout_holder, inserted_length = self.insert_figure_description(
78+
layout_holder, figure
79+
)
6580

6681
# Update the offset
67-
running_offset += length
82+
running_offset += inserted_length
83+
84+
logging.info("Merged figures into layout.")
85+
logging.info("Updated Layout with Figures: %s", layout_holder.content)
86+
# Precompile regex patterns
87+
irrelevant_figure_pattern = re.compile(
88+
r"<figure[^>]*>\s*(Irrelevant Image|\'Irrelevant Image\')\s*</figure>",
89+
re.DOTALL,
90+
)
91+
empty_or_whitespace_figure_pattern = re.compile(
92+
r"<figure[^>]*>\s*</figure>", re.DOTALL
93+
)
94+
html_comments_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
6895

6996
# Remove irrelevant figures
70-
irrelevant_figure_pattern = r"<figure[^>]*>.*?Irrelevant Image.*?</figure>"
71-
layout.content = re.sub(
72-
irrelevant_figure_pattern, "", layout.content, flags=re.DOTALL
97+
layout_holder.content = irrelevant_figure_pattern.sub("", layout_holder.content)
98+
logging.info("Removed irrelevant figures from layout.")
99+
logging.info(
100+
"Updated Layout without Irrelevant Figures: %s", layout_holder.content
73101
)
74102

75-
empty_or_whitespace_figure_pattern = r"<figure[^>]*>\s*</figure>"
76-
layout.content = re.sub(
77-
empty_or_whitespace_figure_pattern, "", layout.content, flags=re.DOTALL
103+
# Remove empty or whitespace figures
104+
layout_holder.content = empty_or_whitespace_figure_pattern.sub(
105+
"", layout_holder.content
78106
)
79-
80-
html_comments_pattern = r"<!--.*?-->"
81-
layout.content = re.sub(
82-
html_comments_pattern, "", layout.content, flags=re.DOTALL
107+
logging.info("Removed empty or whitespace figures from layout.")
108+
logging.info(
109+
"Updated Layout without Empty or Whitespace Figures: %s",
110+
layout_holder.content,
83111
)
84112

85-
return layout
113+
# Remove HTML comments
114+
layout_holder.content = html_comments_pattern.sub("", layout_holder.content)
115+
logging.info("Removed HTML comments from layout.")
116+
logging.info("Updated Layout without HTML Comments: %s", layout_holder.content)
117+
118+
return layout_holder
86119

87120
async def merge(self, record: dict) -> dict:
88121
"""
@@ -94,19 +127,21 @@ async def merge(self, record: dict) -> dict:
94127
Returns:
95128
- record (dict): The record containing the image, its caption, and the generated description.
96129
"""
97-
layout = LayoutHolder(**record["data"]["layout"])
130+
layout_holder = LayoutHolder(**record["data"]["layout"])
98131

99132
figures = [FigureHolder(**figure) for figure in record["data"]["figures"]]
100133

101134
try:
102-
logging.info(f"Input Data: {layout}")
103-
updated_layout = await self.merge_figures_into_layout(layout, figures)
104-
logging.info(f"Updated Data: {updated_layout}")
135+
logging.info(f"Input Data: {layout_holder}")
136+
updated_layout = await self.merge_figures_into_layout(
137+
layout_holder, figures
138+
)
139+
logging.info(f"Updated Layout Data: {updated_layout}")
105140
except Exception as e:
106141
logging.error(f"Failed to merge figures into layout. Error: {e}")
107142
return {
108143
"recordId": record["recordId"],
109-
"data": {},
144+
"data": None,
110145
"errors": [
111146
{
112147
"message": "Failed to merge figures into layout.",

image_processing/src/image_processing/requirements.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# This file was autogenerated by uv via the following command:
22
# uv export --frozen --no-hashes --no-editable --no-sources --no-group dev --directory image_processing -o src/image_processing/requirements.txt
33
aiohappyeyeballs==2.4.4
4-
aiohttp==3.11.11
4+
aiohttp==3.11.12
55
aiosignal==1.3.2
66
annotated-types==0.7.0
77
anyio==4.8.0
@@ -16,7 +16,7 @@ azure-identity==1.19.0
1616
azure-search==1.0.0b2
1717
azure-search-documents==11.6.0b8
1818
azure-storage-blob==12.24.1
19-
beautifulsoup4==4.12.3
19+
beautifulsoup4==4.13.3
2020
blis==0.7.11
2121
bs4==0.0.2
2222
catalogue==2.0.10
@@ -34,7 +34,7 @@ en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_
3434
et-xmlfile==2.0.0
3535
filelock==3.17.0
3636
frozenlist==1.5.0
37-
fsspec==2024.12.0
37+
fsspec==2025.2.0
3838
h11==0.14.0
3939
httpcore==1.0.7
4040
httpx==0.28.1
@@ -50,15 +50,15 @@ marisa-trie==1.2.1
5050
markdown-it-py==3.0.0
5151
markupsafe==3.0.2
5252
mdurl==0.1.2
53-
model2vec==0.3.8
53+
model2vec==0.3.9
5454
msal==1.31.1
5555
msal-extensions==1.2.0
5656
msrest==0.7.1
5757
multidict==6.1.0
5858
murmurhash==1.0.12
5959
numpy==1.26.4
6060
oauthlib==3.2.2
61-
openai==1.60.2
61+
openai==1.61.1
6262
openpyxl==3.1.5
6363
packaging==24.2
6464
pandas==2.2.3
@@ -71,7 +71,7 @@ pydantic==2.10.6
7171
pydantic-core==2.27.2
7272
pygments==2.19.1
7373
pyjwt==2.10.1
74-
pymupdf==1.25.2
74+
pymupdf==1.25.3
7575
python-dateutil==2.9.0.post0
7676
python-dotenv==1.0.1
7777
pytz==2025.1

image_processing/src/image_processing/semantic_text_chunker.py

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,22 @@ def num_tokens_from_string(self, string: str) -> int:
5959

6060
return len(encoding.encode(string))
6161

62+
def clean_chunks_and_map(self, chunks, is_table_or_figure_map):
63+
cleaned_chunks = []
64+
cleaned_is_table_or_figure_map = []
65+
66+
for current_chunk, is_table_or_figure in zip(chunks, is_table_or_figure_map):
67+
cleaned_chunk = current_chunk.strip()
68+
if len(cleaned_chunk) > 0:
69+
# Re-wrap the chunk in blank lines if it is a Markdown heading (strip() above removed them)
70+
if self.is_markdown_heading(current_chunk):
71+
cleaned_chunk = "\n\n" + cleaned_chunk + "\n\n"
72+
73+
cleaned_chunks.append(cleaned_chunk)
74+
cleaned_is_table_or_figure_map.append(is_table_or_figure)
75+
76+
return cleaned_chunks, cleaned_is_table_or_figure_map
77+
6278
async def chunk(self, text: str) -> list[dict]:
6379
"""Attempts to chunk the text by:
6480
Splitting into sentences
@@ -86,6 +102,10 @@ async def chunk(self, text: str) -> list[dict]:
86102
grouped_sentences, is_table_or_figure_map
87103
)
88104

105+
forward_pass_chunks, new_is_table_or_figure_map = self.clean_chunks_and_map(
106+
forward_pass_chunks, new_is_table_or_figure_map
107+
)
108+
89109
logging.info(
90110
f"""Number of Forward pass chunks: {
91111
len(forward_pass_chunks)}"""
@@ -129,7 +149,7 @@ def filter_empty_figures(self, text):
129149

130150
def clean_new_lines(self, text):
131151
# Remove single newlines that sit directly between a '>' and a '<' (e.g. between adjacent tags)
132-
cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text)
152+
cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text.strip())
133153

134154
# Replace all other single newlines with space
135155
cleaned_text = re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned_text)
@@ -190,7 +210,7 @@ def split_into_sentences(self, text: str) -> list[str]:
190210
self.is_markdown_heading(part)
191211
and part.endswith("\n\n") is False
192212
):
193-
part = part + "\n\n"
213+
part = "\n\n" + part + "\n\n"
194214

195215
heading_split_sentences.append(part)
196216

@@ -300,23 +320,36 @@ def retrive_current_chunk_at_n(n):
300320
else:
301321
return current_chunk[n]
302322

303-
current_chunk_tokens = self.num_tokens_from_string(" ".join(current_chunk))
323+
def get_current_chunk_tokens(chunk_segments):
324+
return self.num_tokens_from_string(" ".join(chunk_segments))
325+
326+
current_chunk_tokens = get_current_chunk_tokens(current_chunk)
304327

305328
if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens:
306-
logging.info("Comparing chunks")
307-
cosine_sim = self.sentence_similarity(
308-
retrieve_current_chunks_from_n(-2), current_sentence
309-
)
329+
# Work out what the two resulting chunks would be if we were to split here
330+
if len(current_chunk) > 2:
331+
would_be_new_chunk = retrieve_current_chunk_up_to_n(1)
332+
would_be_current_chunk = [retrive_current_chunk_at_n(-1)]
333+
else:
334+
would_be_new_chunk = retrive_current_chunk_at_n(0)
335+
would_be_current_chunk = [retrive_current_chunk_at_n(1)]
336+
310337
if (
311-
cosine_sim < self.similarity_threshold
312-
or current_chunk_tokens >= self.max_chunk_tokens
338+
get_current_chunk_tokens(would_be_new_chunk) >= self.min_chunk_tokens
339+
and get_current_chunk_tokens(would_be_current_chunk)
340+
>= self.min_chunk_tokens
313341
):
314-
if len(current_chunk) > 2:
315-
new_chunk = retrieve_current_chunk_up_to_n(1)
316-
current_chunk = [retrive_current_chunk_at_n(-1)]
317-
else:
318-
new_chunk = retrive_current_chunk_at_n(0)
319-
current_chunk = [retrive_current_chunk_at_n(1)]
342+
logging.info("Comparing chunks")
343+
if (
344+
current_chunk_tokens >= self.max_chunk_tokens
345+
or self.sentence_similarity(
346+
retrieve_current_chunks_from_n(-2), current_sentence
347+
)
348+
< self.similarity_threshold
349+
):
350+
return would_be_new_chunk, would_be_current_chunk
351+
else:
352+
logging.info("Chunk too small to compare")
320353
else:
321354
logging.info("Chunk too small to compare")
322355

0 commit comments

Comments
 (0)