99import pytest
1010from azure .ai .documentintelligence .aio import DocumentIntelligenceClient
1111from azure .ai .documentintelligence .models import (
12+ AnalyzeDocumentRequest ,
1213 AnalyzeResult ,
1314 BoundingRegion ,
1415 DocumentCaption ,
2122from azure .core .credentials import AzureKeyCredential
2223from azure .core .exceptions import HttpResponseError
2324from PIL import Image , ImageChops
25+ from werkzeug .datastructures import FileStorage
2426
2527from prepdocslib .figureprocessor import (
2628 FigureProcessor ,
@@ -178,8 +180,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box):
178180@pytest .mark .asyncio
179181async def test_parse_simple (monkeypatch ):
180182 mock_poller = MagicMock ()
183+ captured_bodies : list [AnalyzeDocumentRequest ] = []
181184
182- async def mock_begin_analyze_document (self , model_id , analyze_request , ** kwargs ):
185+ async def mock_begin_analyze_document (self , model_id , ** kwargs ):
186+ body = kwargs ["body" ]
187+ captured_bodies .append (body )
183188 return mock_poller
184189
185190 async def mock_poller_result ():
@@ -205,13 +210,56 @@ async def mock_poller_result():
205210 assert pages [0 ].page_num == 0
206211 assert pages [0 ].offset == 0
207212 assert pages [0 ].text == "Page content"
213+ assert len (captured_bodies ) == 1
214+ assert isinstance (captured_bodies [0 ], AnalyzeDocumentRequest )
215+ assert captured_bodies [0 ].bytes_source == b"pdf content bytes"
216+
217+
@pytest.mark.asyncio
async def test_parse_with_filestorage(monkeypatch):
    """Parse an upload wrapped in a werkzeug ``FileStorage``.

    ``DocumentIntelligenceClient.begin_analyze_document`` is replaced with a
    stub that records the ``body`` keyword it receives and hands back a mocked
    poller. The test then checks that exactly one page comes out of
    ``parser.parse`` and that the recorded body is an
    ``AnalyzeDocumentRequest`` whose ``bytes_source`` equals the uploaded bytes.
    """
    poller = MagicMock()
    seen_bodies: list[AnalyzeDocumentRequest] = []

    async def fake_begin_analyze_document(self, model_id, **kwargs):
        # Keep the request body so it can be inspected after parsing.
        seen_bodies.append(kwargs["body"])
        return poller

    async def fake_poller_result():
        # A single page whose span covers the 12 characters of "Page content".
        only_page = DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])
        return AnalyzeResult(
            content="Page content",
            pages=[only_page],
            tables=[],
            figures=[],
        )

    monkeypatch.setattr(poller, "result", fake_poller_result)
    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", fake_begin_analyze_document)

    parser = DocumentAnalysisParser(endpoint="https://example.com", credential=MockAzureCredential())

    upload = FileStorage(stream=io.BytesIO(b"pdf content bytes"), filename="upload.pdf")
    # NOTE(review): presumably the parser reads ``.name`` from its input -- confirm against the parser.
    upload.name = "upload.pdf"

    pages = []
    async for parsed_page in parser.parse(upload):
        pages.append(parsed_page)

    assert len(pages) == 1
    first_page = pages[0]
    assert first_page.page_num == 0
    assert first_page.offset == 0
    assert first_page.text == "Page content"

    assert len(seen_bodies) == 1
    sent_body = seen_bodies[0]
    assert isinstance(sent_body, AnalyzeDocumentRequest)
    assert sent_body.bytes_source == b"pdf content bytes"
208254
209255
210256@pytest .mark .asyncio
211257async def test_parse_doc_with_tables (monkeypatch ):
212258 mock_poller = MagicMock ()
259+ captured_bodies : list [AnalyzeDocumentRequest ] = []
213260
214- async def mock_begin_analyze_document (self , model_id , analyze_request , ** kwargs ):
261+ async def mock_begin_analyze_document (self , model_id , ** kwargs ):
262+ captured_bodies .append (kwargs ["body" ])
215263 return mock_poller
216264
217265 async def mock_poller_result ():
@@ -281,13 +329,17 @@ async def mock_poller_result():
281329 pages [0 ].text
282330 == "# Simple HTML Table\n \n \n <figure><table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table></figure>"
283331 )
332+ assert len (captured_bodies ) == 1
333+ assert isinstance (captured_bodies [0 ], AnalyzeDocumentRequest )
284334
285335
286336@pytest .mark .asyncio
287337async def test_parse_doc_with_figures (monkeypatch ):
288338 mock_poller = MagicMock ()
339+ captured_kwargs : list [dict ] = []
289340
290- async def mock_begin_analyze_document (self , model_id , analyze_request , ** kwargs ):
341+ async def mock_begin_analyze_document (self , model_id , ** kwargs ):
342+ captured_kwargs .append (kwargs )
291343 return mock_poller
292344
293345 async def mock_poller_result ():
@@ -330,13 +382,20 @@ async def mock_poller_result():
330382 == '# Simple Figure\n \n This text is before the figure and NOT part of it.\n \n \n <figure id="1.1"></figure>\n \n \n This is text after the figure that\' s not part of it.'
331383 )
332384 assert pages [0 ].images [0 ].placeholder == '<figure id="1.1"></figure>'
385+ assert len (captured_kwargs ) == 1
386+ body = captured_kwargs [0 ]["body" ]
387+ assert isinstance (body , AnalyzeDocumentRequest )
388+ assert captured_kwargs [0 ]["output" ] == ["figures" ]
389+ assert captured_kwargs [0 ]["features" ] == ["ocrHighResolution" ]
333390
334391
335392@pytest .mark .asyncio
336393async def test_parse_unsupportedformat (monkeypatch , caplog ):
337394 mock_poller = MagicMock ()
395+ captured_kwargs : list [dict ] = []
338396
339- async def mock_begin_analyze_document (self , model_id , analyze_request , ** kwargs ):
397+ async def mock_begin_analyze_document (self , model_id , ** kwargs ):
398+ captured_kwargs .append (kwargs )
340399
341400 if kwargs .get ("features" ) == ["ocrHighResolution" ]:
342401
@@ -387,6 +446,11 @@ async def mock_poller_result():
387446 assert pages [0 ].page_num == 0
388447 assert pages [0 ].offset == 0
389448 assert pages [0 ].text == "Page content"
449+ assert len (captured_kwargs ) == 2
450+ assert captured_kwargs [0 ]["features" ] == ["ocrHighResolution" ]
451+ assert isinstance (captured_kwargs [0 ]["body" ], AnalyzeDocumentRequest )
452+ assert captured_kwargs [1 ].get ("features" ) is None
453+ assert isinstance (captured_kwargs [1 ]["body" ], AnalyzeDocumentRequest )
390454
391455
392456@pytest .mark .asyncio
0 commit comments