|
21 | 21 | from azure.core.exceptions import HttpResponseError
|
22 | 22 | from PIL import Image, ImageChops
|
23 | 23 |
|
24 |
| -from prepdocslib.mediadescriber import ContentUnderstandingDescriber |
| 24 | +from prepdocslib.mediadescriber import ( |
| 25 | + ContentUnderstandingDescriber, |
| 26 | + MultimodalModelDescriber, |
| 27 | +) |
25 | 28 | from prepdocslib.page import ImageOnPage
|
26 | 29 | from prepdocslib.pdfparser import DocumentAnalysisParser, MediaDescriptionStrategy
|
27 | 30 |
|
@@ -386,3 +389,78 @@ async def mock_poller_result():
|
386 | 389 | assert pages[0].page_num == 0
|
387 | 390 | assert pages[0].offset == 0
|
388 | 391 | assert pages[0].text == "Page content"
|
| 392 | + |
| 393 | + |
@pytest.mark.asyncio
async def test_parse_doc_with_openai(monkeypatch):
    """Parse a one-figure PDF with the OPENAI media description strategy.

    ``DocumentIntelligenceClient.begin_analyze_document`` is patched to return
    a mock poller whose ``result`` yields a canned ``AnalyzeResult`` holding a
    single page and a single figure. ``MultimodalModelDescriber.describe_image``
    is patched to return ``"Pie chart"``. The test then asserts that the parser
    yields exactly one page whose text has the figure rendered as a
    ``<figure>`` element containing the caption and that description.
    """
    mock_poller = MagicMock()

    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
        # Stand-in for the service call: hand back the poller we control.
        return mock_poller

    async def mock_poller_result():
        # Read the fixture inside a context manager so the file handle is
        # closed deterministically, and pin the encoding so the
        # character-based span offsets below do not depend on the platform
        # default.
        with open(TEST_DATA_DIR / "Simple Figure_content.txt", encoding="utf-8") as content_file:
            content = content_file.read()
        return AnalyzeResult(
            content=content,
            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=148)])],
            figures=[
                DocumentFigure(
                    id="1.1",
                    caption=DocumentCaption(content="Figure 1"),
                    bounding_regions=[
                        BoundingRegion(
                            page_number=1, polygon=[0.4295, 1.3072, 1.7071, 1.3076, 1.7067, 2.6088, 0.4291, 2.6085]
                        )
                    ],
                    spans=[DocumentSpan(offset=70, length=22)],
                )
            ],
        )

    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
    monkeypatch.setattr(mock_poller, "result", mock_poller_result)

    async def mock_describe_image(self, image_bytes):
        # Fixed description so the expected page text below is deterministic.
        return "Pie chart"

    monkeypatch.setattr(MultimodalModelDescriber, "describe_image", mock_describe_image)

    parser = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
        media_description_strategy=MediaDescriptionStrategy.OPENAI,
        openai_client=Mock(),
        openai_model="gpt-4o",
        openai_deployment="gpt-4o",
    )

    with open(TEST_DATA_DIR / "Simple Figure.pdf", "rb") as f:
        content = io.BytesIO(f.read())
        content.name = "Simple Figure.pdf"

    pages = [page async for page in parser.parse(content)]

    assert len(pages) == 1
    assert pages[0].page_num == 0
    assert pages[0].offset == 0
    assert (
        pages[0].text
        == "# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n<figure><figcaption>1.1 Figure 1<br>Pie chart</figcaption></figure>\n\n\nThis is text after the figure that's not part of it."
    )
| 450 | + |
| 451 | + |
@pytest.mark.asyncio
async def test_parse_doc_with_openai_missing_parameters():
    """Expect a ValueError when the OPENAI strategy is used without an OpenAI client.

    The parser is constructed with ``MediaDescriptionStrategy.OPENAI`` but with
    no ``openai_client`` or ``openai_model``; requesting the first parsed page
    must raise ``ValueError`` with the expected message.
    """
    pdf_stream = io.BytesIO(b"pdf content bytes")
    pdf_stream.name = "test.pdf"

    # openai_client and openai_model are deliberately left out here.
    analyzer = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
        media_description_strategy=MediaDescriptionStrategy.OPENAI,
    )

    expected_error = "OpenAI client must be provided when using OpenAI media description strategy"
    with pytest.raises(ValueError, match=expected_error):
        # Advance the async generator by a single step instead of using
        # ``async for``; only the first iteration is needed.
        page_iterator = analyzer.parse(pdf_stream)
        await page_iterator.__anext__()
0 commit comments