Skip to content

Commit 8ec9514

Browse files
committed
Tests added for PDFParser
1 parent b8c4d94 commit 8ec9514

8 files changed

+361
-18
lines changed

app/backend/prepdocslib/pdfparser.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -90,20 +90,20 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
9090
poller = await document_intelligence_client.begin_analyze_document(
9191
model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
9292
)
93-
form_recognizer_results: AnalyzeResult = await poller.result()
93+
analyze_result: AnalyzeResult = await poller.result()
9494

9595
offset = 0
96-
for page in form_recognizer_results.pages:
96+
for page in analyze_result.pages:
9797
tables_on_page = [
9898
table
99-
for table in (form_recognizer_results.tables or [])
99+
for table in (analyze_result.tables or [])
100100
if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number
101101
]
102102
figures_on_page = []
103103
if self.use_content_understanding:
104104
figures_on_page = [
105105
figure
106-
for figure in (form_recognizer_results.figures or [])
106+
for figure in (analyze_result.figures or [])
107107
if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number
108108
]
109109

@@ -112,17 +112,18 @@ class ObjectType(Enum):
112112
TABLE = 0
113113
FIGURE = 1
114114

115-
# mark all positions of the table spans in the page
116115
page_offset = page.spans[0].offset
117116
page_length = page.spans[0].length
118117
mask_chars: list[tuple[ObjectType, Union[int, None]]] = [(ObjectType.NONE, None)] * page_length
118+
# mark all positions of the table spans in the page
119119
for table_idx, table in enumerate(tables_on_page):
120120
for span in table.spans:
121121
# mark every character covered by this table span as (ObjectType.TABLE, table_idx) in mask_chars
122122
for i in range(span.length):
123123
idx = span.offset - page_offset + i
124124
if idx >= 0 and idx < page_length:
125125
mask_chars[idx] = (ObjectType.TABLE, table_idx)
126+
# mark all positions of the figure spans in the page
126127
for figure_idx, figure in enumerate(figures_on_page):
127128
for span in figure.spans:
128129
# mark every character covered by this figure span in mask_chars
@@ -137,7 +138,7 @@ class ObjectType(Enum):
137138
for idx, mask_char in enumerate(mask_chars):
138139
object_type, object_idx = mask_char
139140
if object_type == ObjectType.NONE:
140-
page_text += form_recognizer_results.content[page_offset + idx]
141+
page_text += analyze_result.content[page_offset + idx]
141142
elif object_type == ObjectType.TABLE:
142143
if object_idx is None:
143144
raise ValueError("Expected object_idx to be set")
@@ -151,7 +152,7 @@ class ObjectType(Enum):
151152
raise ValueError("Expected object_idx to be set")
152153
if mask_char not in added_objects:
153154
figure_html = await DocumentAnalysisParser.figure_to_html(
154-
doc_for_pymupdf, cu_describer, figures_on_page[object_idx]
155+
doc_for_pymupdf, figures_on_page[object_idx], cu_describer
155156
)
156157
page_text += figure_html
157158
added_objects.add(mask_char)
@@ -164,21 +165,23 @@ class ObjectType(Enum):
164165

165166
@staticmethod
166167
async def figure_to_html(
167-
doc: pymupdf.Document, cu_describer: ContentUnderstandingDescriber, figure: DocumentFigure
168+
doc: pymupdf.Document, figure: DocumentFigure, cu_describer: ContentUnderstandingDescriber
168169
) -> str:
169170
figure_title = (figure.caption and figure.caption.content) or ""
170171
logger.info("Describing figure %s with title '%s'", figure.id, figure_title)
171172
if not figure.bounding_regions:
172173
return f"<figure><figcaption>{figure_title}</figcaption></figure>"
173-
for region in figure.bounding_regions:
174-
# To learn more about bounding regions, see https://aka.ms/bounding-region
175-
bounding_box = (
176-
region.polygon[0], # x0 (left)
177-
region.polygon[1], # y0 (top
178-
region.polygon[4], # x1 (right)
179-
region.polygon[5], # y1 (bottom)
180-
)
181-
page_number = figure.bounding_regions[0]["pageNumber"] # 1-indexed
174+
if len(figure.bounding_regions) > 1:
175+
logger.warning("Figure %s has more than one bounding region, using the first one", figure.id)
176+
first_region = figure.bounding_regions[0]
177+
# To learn more about bounding regions, see https://aka.ms/bounding-region
178+
bounding_box = (
179+
first_region.polygon[0], # x0 (left)
180+
first_region.polygon[1], # y0 (top)
181+
first_region.polygon[4], # x1 (right)
182+
first_region.polygon[5], # y1 (bottom)
183+
)
184+
page_number = first_region["pageNumber"] # 1-indexed
182185
cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
183186
figure_description = await cu_describer.describe_image(cropped_img)
184187
return f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
@@ -205,7 +208,7 @@ def table_to_html(table: DocumentTable):
205208
return table_html
206209

207210
@staticmethod
208-
def crop_image_from_pdf_page(doc: pymupdf.Document, page_number, bounding_box) -> bytes:
211+
def crop_image_from_pdf_page(doc: pymupdf.Document, page_number: int, bounding_box: tuple[float]) -> bytes:
209212
"""
210213
Crops a region from a given page in a PDF and returns it as an image.
211214
202 KB
Binary file not shown.
73.6 KB
Loading

tests/test-data/Simple Figure.pdf

44.3 KB
Binary file not shown.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Simple Figure
2+
3+
This text is before the figure and NOT part of it.
4+
5+
6+
<figure>
7+
8+
9
9+
10+
</figure>
11+
12+
13+
This is text after the figure that's not part of it.

tests/test-data/Simple Table.pdf

26 KB
Binary file not shown.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Simple HTML Table
2+
3+
4+
<table>
5+
<tr>
6+
<th>Header 1</th>
7+
<th>Header 2</th>
8+
</tr>
9+
<tr>
10+
<td>Cell 1</td>
11+
<td>Cell 2</td>
12+
</tr>
13+
<tr>
14+
<td>Cell 3</td>
15+
<td>Cell 4</td>
16+
</tr>
17+
</table>

0 commit comments

Comments
 (0)