Skip to content

Commit ebfcfc5

Browse files
committed
Update more tests
1 parent 751abd1 commit ebfcfc5

File tree

5 files changed

+46
-19
lines changed

5 files changed

+46
-19
lines changed

app/backend/prepdocslib/filestrategy.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ async def parse_file(
2121
blob_manager: Optional[BlobManager] = None,
2222
image_embeddings_client: Optional[ImageEmbeddings] = None,
2323
) -> list[Section]:
24-
await blob_manager.upload_blob(file)
2524
key = file.file_extension().lower()
2625
processor = file_processors.get(key)
2726
if processor is None:

app/backend/prepdocslib/pdfparser.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,14 @@ async def process_figure(doc: pymupdf.Document, figure: DocumentFigure, media_de
227227
"Describing figure %s with title '%s' using %s", figure.id, figure_title, type(media_describer).__name__
228228
)
229229
if not figure.bounding_regions:
230-
return f"<figure><figcaption>{figure_title}</figcaption></figure>"
230+
return ImageOnPage(
231+
bytes=b"",
232+
page_num=0,
233+
figure_id=figure.id,
234+
bbox=[0, 0, 0, 0],
235+
filename=f"figure{figure.id.replace('.', '_')}.png",
236+
description=f"<figure><figcaption>{figure_title}</figcaption></figure>",
237+
)
231238
if len(figure.bounding_regions) > 1:
232239
logger.warning("Figure %s has more than one bounding region, using the first one", figure.id)
233240
first_region = figure.bounding_regions[0]

tests/test_pdfparser.py

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@
2222
from PIL import Image, ImageChops
2323

2424
from prepdocslib.mediadescriber import ContentUnderstandingDescriber
25-
from prepdocslib.pdfparser import DocumentAnalysisParser
25+
from prepdocslib.page import ImageOnPage
26+
from prepdocslib.pdfparser import DocumentAnalysisParser, MediaDescriptionStrategy
2627

2728
from .mocks import MockAzureCredential
2829

@@ -44,11 +45,13 @@ def test_crop_image_from_pdf_page():
4445
page_number = 2
4546
bounding_box = (1.4703, 2.8371, 5.5381, 6.6022) # Coordinates in inches
4647

47-
cropped_image_bytes = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number, bounding_box)
48+
cropped_image_bytes, bbox_pixels = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number, bounding_box)
4849

4950
# Verify the output is not empty
5051
assert cropped_image_bytes is not None
5152
assert len(cropped_image_bytes) > 0
53+
assert bbox_pixels is not None
54+
assert len(bbox_pixels) == 4
5255

5356
# Verify the output is a valid image
5457
cropped_image = Image.open(io.BytesIO(cropped_image_bytes))
@@ -59,6 +62,8 @@ def test_crop_image_from_pdf_page():
5962
expected_image = Image.open(TEST_DATA_DIR / "Financial Market Analysis Report 2023_page2_figure.png")
6063
assert_image_equal(cropped_image, expected_image)
6164

65+
# TODO: assert bbox pixels too
66+
6267

6368
def test_table_to_html():
6469
table = DocumentTable(
@@ -106,19 +111,20 @@ def test_table_to_html_with_spans():
106111

107112

108113
@pytest.mark.asyncio
109-
async def test_figure_to_html_without_bounding_regions():
114+
async def test_process_figure_without_bounding_regions():
110115
doc = MagicMock()
111116
figure = DocumentFigure(id="1", caption=None, bounding_regions=None)
112-
cu_describer = MagicMock()
117+
media_describer = MagicMock()
113118

114-
result_html = await DocumentAnalysisParser.figure_to_html(doc, figure, cu_describer)
119+
result = await DocumentAnalysisParser.process_figure(doc, figure, media_describer)
115120
expected_html = "<figure><figcaption></figcaption></figure>"
116121

117-
assert result_html == expected_html
122+
assert isinstance(result, ImageOnPage)
123+
assert result.description == expected_html
118124

119125

120126
@pytest.mark.asyncio
121-
async def test_figure_to_html_with_bounding_regions(monkeypatch, caplog):
127+
async def test_process_figure_with_bounding_regions(monkeypatch, caplog):
122128
doc = MagicMock()
123129
figure = DocumentFigure(
124130
id="1",
@@ -128,25 +134,32 @@ async def test_figure_to_html_with_bounding_regions(monkeypatch, caplog):
128134
BoundingRegion(page_number=2, polygon=[1.4703, 2.8371, 5.5409, 2.8415, 5.5381, 6.6022, 1.4681, 6.5978]),
129135
],
130136
)
131-
cu_describer = AsyncMock()
137+
media_describer = AsyncMock()
132138

133139
async def mock_describe_image(image_bytes):
134140
assert image_bytes == b"image_bytes"
135141
return "Described Image"
136142

137-
monkeypatch.setattr(cu_describer, "describe_image", mock_describe_image)
143+
monkeypatch.setattr(media_describer, "describe_image", mock_describe_image)
138144

139-
def mock_crop_image_from_pdf_page(doc, page_number, bounding_box) -> bytes:
145+
def mock_crop_image_from_pdf_page(doc, page_number, bounding_box):
140146
assert page_number == 0
141147
assert bounding_box == (1.4703, 2.8371, 5.5381, 6.6022)
142-
return b"image_bytes"
148+
return b"image_bytes", [10, 20, 30, 40]
143149

144150
monkeypatch.setattr(DocumentAnalysisParser, "crop_image_from_pdf_page", mock_crop_image_from_pdf_page)
145151

146152
with caplog.at_level(logging.WARNING):
147-
result_html = await DocumentAnalysisParser.figure_to_html(doc, figure, cu_describer)
153+
result = await DocumentAnalysisParser.process_figure(doc, figure, media_describer)
148154
expected_html = "<figure><figcaption>Figure 1<br>Described Image</figcaption></figure>"
149-
assert result_html == expected_html
155+
156+
assert isinstance(result, ImageOnPage)
157+
assert result.description == expected_html
158+
assert result.bytes == b"image_bytes"
159+
assert result.page_num == 0
160+
assert result.figure_id == "1"
161+
assert result.bbox == [10, 20, 30, 40]
162+
assert result.filename == "figure1.png"
150163
assert "Figure 1 has more than one bounding region, using the first one" in caplog.text
151164

152165

@@ -169,7 +182,9 @@ async def mock_poller_result():
169182
monkeypatch.setattr(mock_poller, "result", mock_poller_result)
170183

171184
parser = DocumentAnalysisParser(
172-
endpoint="https://example.com", credential=MockAzureCredential(), use_content_understanding=False
185+
endpoint="https://example.com",
186+
credential=MockAzureCredential(),
187+
media_description_strategy=MediaDescriptionStrategy.NONE,
173188
)
174189
content = io.BytesIO(b"pdf content bytes")
175190
content.name = "test.pdf"
@@ -240,7 +255,9 @@ async def mock_poller_result():
240255
monkeypatch.setattr(mock_poller, "result", mock_poller_result)
241256

242257
parser = DocumentAnalysisParser(
243-
endpoint="https://example.com", credential=MockAzureCredential(), use_content_understanding=False
258+
endpoint="https://example.com",
259+
credential=MockAzureCredential(),
260+
media_description_strategy=MediaDescriptionStrategy.NONE,
244261
)
245262
with open(TEST_DATA_DIR / "Simple Table.pdf", "rb") as f:
246263
content = io.BytesIO(f.read())
@@ -293,7 +310,7 @@ async def mock_describe_image(self, image_bytes):
293310
parser = DocumentAnalysisParser(
294311
endpoint="https://example.com",
295312
credential=MockAzureCredential(),
296-
use_content_understanding=True,
313+
media_description_strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING,
297314
content_understanding_endpoint="https://example.com",
298315
)
299316

@@ -357,7 +374,7 @@ async def mock_poller_result():
357374
parser = DocumentAnalysisParser(
358375
endpoint="https://example.com",
359376
credential=MockAzureCredential(),
360-
use_content_understanding=True,
377+
media_description_strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING,
361378
content_understanding_endpoint="https://example.com",
362379
)
363380
content = io.BytesIO(b"pdf content bytes")

tests/test_prepdocslib_filestrategy.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ async def mock_upload_documents(self, documents):
7474
"content": "texttext",
7575
"category": None,
7676
"groups": ["A-GROUP-ID"],
77+
"images": [],
7778
"oids": ["A-USER-ID"],
7879
"sourcepage": "a.txt",
7980
"sourcefile": "a.txt",
@@ -84,6 +85,7 @@ async def mock_upload_documents(self, documents):
8485
"content": "texttext",
8586
"category": None,
8687
"groups": ["B-GROUP-ID"],
88+
"images": [],
8789
"oids": ["B-USER-ID"],
8890
"sourcepage": "b.txt",
8991
"sourcefile": "b.txt",
@@ -94,6 +96,7 @@ async def mock_upload_documents(self, documents):
9496
"content": "texttext",
9597
"category": None,
9698
"groups": ["C-GROUP-ID"],
99+
"images": [],
97100
"oids": ["C-USER-ID"],
98101
"sourcepage": "c.txt",
99102
"sourcefile": "c.txt",

todo.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ TODO:
66
* Add a vectorizer for the images field - the AI Services Vision vectorizer described at https://learn.microsoft.com/en-us/azure/search/vector-search-vectorizer-ai-services-vision
77
* Update the approaches to set image_sources appropriately in run_agentic_retrieval_approach
88
* Test with integrated vectorization
9+
* Update all TODOs in the code/docs

0 commit comments

Comments
 (0)