22
22
from PIL import Image , ImageChops
23
23
24
24
from prepdocslib .mediadescriber import ContentUnderstandingDescriber
25
- from prepdocslib .pdfparser import DocumentAnalysisParser
25
+ from prepdocslib .page import ImageOnPage
26
+ from prepdocslib .pdfparser import DocumentAnalysisParser , MediaDescriptionStrategy
26
27
27
28
from .mocks import MockAzureCredential
28
29
@@ -44,11 +45,13 @@ def test_crop_image_from_pdf_page():
44
45
page_number = 2
45
46
bounding_box = (1.4703 , 2.8371 , 5.5381 , 6.6022 ) # Coordinates in inches
46
47
47
- cropped_image_bytes = DocumentAnalysisParser .crop_image_from_pdf_page (doc , page_number , bounding_box )
48
+ cropped_image_bytes , bbox_pixels = DocumentAnalysisParser .crop_image_from_pdf_page (doc , page_number , bounding_box )
48
49
49
50
# Verify the output is not empty
50
51
assert cropped_image_bytes is not None
51
52
assert len (cropped_image_bytes ) > 0
53
+ assert bbox_pixels is not None
54
+ assert len (bbox_pixels ) == 4
52
55
53
56
# Verify the output is a valid image
54
57
cropped_image = Image .open (io .BytesIO (cropped_image_bytes ))
@@ -59,6 +62,8 @@ def test_crop_image_from_pdf_page():
59
62
expected_image = Image .open (TEST_DATA_DIR / "Financial Market Analysis Report 2023_page2_figure.png" )
60
63
assert_image_equal (cropped_image , expected_image )
61
64
65
+ # TODO: assert bbox pixels too
66
+
62
67
63
68
def test_table_to_html ():
64
69
table = DocumentTable (
@@ -106,19 +111,20 @@ def test_table_to_html_with_spans():
106
111
107
112
108
113
@pytest .mark .asyncio
109
- async def test_figure_to_html_without_bounding_regions ():
114
+ async def test_process_figure_without_bounding_regions ():
110
115
doc = MagicMock ()
111
116
figure = DocumentFigure (id = "1" , caption = None , bounding_regions = None )
112
- cu_describer = MagicMock ()
117
+ media_describer = MagicMock ()
113
118
114
- result_html = await DocumentAnalysisParser .figure_to_html (doc , figure , cu_describer )
119
+ result = await DocumentAnalysisParser .process_figure (doc , figure , media_describer )
115
120
expected_html = "<figure><figcaption></figcaption></figure>"
116
121
117
- assert result_html == expected_html
122
+ assert isinstance (result , ImageOnPage )
123
+ assert result .description == expected_html
118
124
119
125
120
126
@pytest .mark .asyncio
121
- async def test_figure_to_html_with_bounding_regions (monkeypatch , caplog ):
127
+ async def test_process_figure_with_bounding_regions (monkeypatch , caplog ):
122
128
doc = MagicMock ()
123
129
figure = DocumentFigure (
124
130
id = "1" ,
@@ -128,25 +134,32 @@ async def test_figure_to_html_with_bounding_regions(monkeypatch, caplog):
128
134
BoundingRegion (page_number = 2 , polygon = [1.4703 , 2.8371 , 5.5409 , 2.8415 , 5.5381 , 6.6022 , 1.4681 , 6.5978 ]),
129
135
],
130
136
)
131
- cu_describer = AsyncMock ()
137
+ media_describer = AsyncMock ()
132
138
133
139
async def mock_describe_image (image_bytes ):
134
140
assert image_bytes == b"image_bytes"
135
141
return "Described Image"
136
142
137
- monkeypatch .setattr (cu_describer , "describe_image" , mock_describe_image )
143
+ monkeypatch .setattr (media_describer , "describe_image" , mock_describe_image )
138
144
139
- def mock_crop_image_from_pdf_page (doc , page_number , bounding_box ) -> bytes :
145
+ def mock_crop_image_from_pdf_page (doc , page_number , bounding_box ):
140
146
assert page_number == 0
141
147
assert bounding_box == (1.4703 , 2.8371 , 5.5381 , 6.6022 )
142
- return b"image_bytes"
148
+ return b"image_bytes" , [ 10 , 20 , 30 , 40 ]
143
149
144
150
monkeypatch .setattr (DocumentAnalysisParser , "crop_image_from_pdf_page" , mock_crop_image_from_pdf_page )
145
151
146
152
with caplog .at_level (logging .WARNING ):
147
- result_html = await DocumentAnalysisParser .figure_to_html (doc , figure , cu_describer )
153
+ result = await DocumentAnalysisParser .process_figure (doc , figure , media_describer )
148
154
expected_html = "<figure><figcaption>Figure 1<br>Described Image</figcaption></figure>"
149
- assert result_html == expected_html
155
+
156
+ assert isinstance (result , ImageOnPage )
157
+ assert result .description == expected_html
158
+ assert result .bytes == b"image_bytes"
159
+ assert result .page_num == 0
160
+ assert result .figure_id == "1"
161
+ assert result .bbox == [10 , 20 , 30 , 40 ]
162
+ assert result .filename == "figure1.png"
150
163
assert "Figure 1 has more than one bounding region, using the first one" in caplog .text
151
164
152
165
@@ -169,7 +182,9 @@ async def mock_poller_result():
169
182
monkeypatch .setattr (mock_poller , "result" , mock_poller_result )
170
183
171
184
parser = DocumentAnalysisParser (
172
- endpoint = "https://example.com" , credential = MockAzureCredential (), use_content_understanding = False
185
+ endpoint = "https://example.com" ,
186
+ credential = MockAzureCredential (),
187
+ media_description_strategy = MediaDescriptionStrategy .NONE ,
173
188
)
174
189
content = io .BytesIO (b"pdf content bytes" )
175
190
content .name = "test.pdf"
@@ -240,7 +255,9 @@ async def mock_poller_result():
240
255
monkeypatch .setattr (mock_poller , "result" , mock_poller_result )
241
256
242
257
parser = DocumentAnalysisParser (
243
- endpoint = "https://example.com" , credential = MockAzureCredential (), use_content_understanding = False
258
+ endpoint = "https://example.com" ,
259
+ credential = MockAzureCredential (),
260
+ media_description_strategy = MediaDescriptionStrategy .NONE ,
244
261
)
245
262
with open (TEST_DATA_DIR / "Simple Table.pdf" , "rb" ) as f :
246
263
content = io .BytesIO (f .read ())
@@ -293,7 +310,7 @@ async def mock_describe_image(self, image_bytes):
293
310
parser = DocumentAnalysisParser (
294
311
endpoint = "https://example.com" ,
295
312
credential = MockAzureCredential (),
296
- use_content_understanding = True ,
313
+ media_description_strategy = MediaDescriptionStrategy . CONTENTUNDERSTANDING ,
297
314
content_understanding_endpoint = "https://example.com" ,
298
315
)
299
316
@@ -357,7 +374,7 @@ async def mock_poller_result():
357
374
parser = DocumentAnalysisParser (
358
375
endpoint = "https://example.com" ,
359
376
credential = MockAzureCredential (),
360
- use_content_understanding = True ,
377
+ media_description_strategy = MediaDescriptionStrategy . CONTENTUNDERSTANDING ,
361
378
content_understanding_endpoint = "https://example.com" ,
362
379
)
363
380
content = io .BytesIO (b"pdf content bytes" )
0 commit comments