diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index 666ca33b2c..d8c69d000f 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -68,21 +68,27 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: async with DocumentIntelligenceClient( endpoint=self.endpoint, credential=self.credential ) as document_intelligence_client: - file_analyzed = False + # Always convert to bytes up front to avoid passing a FileStorage/stream object + try: + content.seek(0) + except Exception: + pass + content_bytes = content.read() + + poller = None + doc_for_pymupdf = None + if self.process_figures: - content_bytes = content.read() try: poller = await document_intelligence_client.begin_analyze_document( model_id="prebuilt-layout", - analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes), + body=AnalyzeDocumentRequest(bytes_source=content_bytes), output=["figures"], features=["ocrHighResolution"], output_content_format="markdown", ) doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes)) - file_analyzed = True except HttpResponseError as e: - content.seek(0) if e.error and e.error.code == "InvalidArgument": logger.error( "This document type does not support media description. Proceeding with standard analysis." @@ -92,10 +98,12 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: "Unexpected error analyzing document for media description: %s. Proceeding with standard analysis.", e, ) + poller = None - if file_analyzed is False: + if poller is None: poller = await document_intelligence_client.begin_analyze_document( - model_id=self.model_id, analyze_request=content, content_type="application/octet-stream" + model_id=self.model_id, + body=AnalyzeDocumentRequest(bytes_source=content_bytes), ) analyze_result: AnalyzeResult = await poller.result() diff --git a/app/backend/requirements.in b/app/backend/requirements.in index ba8af3ef36..9b073d704e 100644 --- a/app/backend/requirements.in +++ b/app/backend/requirements.in @@ -5,7 +5,7 @@ quart-cors openai>=1.109.1 tiktoken tenacity -azure-ai-documentintelligence==1.0.0b4 +azure-ai-documentintelligence==1.0.2 azure-cognitiveservices-speech azure-cosmos azure-search-documents==11.7.0b1 diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt index ccdb3b8a00..224813b651 100644 --- a/app/backend/requirements.txt +++ b/app/backend/requirements.txt @@ -24,7 +24,7 @@ async-timeout==5.0.1 # via aiohttp attrs==25.3.0 # via aiohttp -azure-ai-documentintelligence==1.0.0b4 +azure-ai-documentintelligence==1.0.2 # via -r requirements.in azure-cognitiveservices-speech==1.40.0 # via -r requirements.in diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index 23cd2dcabf..4a7017e22c 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -9,6 +9,7 @@ import pytest from azure.ai.documentintelligence.aio import DocumentIntelligenceClient from azure.ai.documentintelligence.models import ( + AnalyzeDocumentRequest, AnalyzeResult, BoundingRegion, DocumentCaption, @@ -21,6 +22,7 @@ from azure.core.credentials import AzureKeyCredential from azure.core.exceptions import HttpResponseError from PIL import Image, ImageChops +from werkzeug.datastructures import FileStorage from prepdocslib.figureprocessor import ( FigureProcessor, @@ -178,8 +180,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box): @pytest.mark.asyncio async def test_parse_simple(monkeypatch): mock_poller = MagicMock() + captured_bodies: list[AnalyzeDocumentRequest] = [] - async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs): + async def mock_begin_analyze_document(self, model_id, **kwargs): + body = kwargs["body"] + captured_bodies.append(body) return mock_poller async def mock_poller_result(): @@ -205,13 +210,106 @@ async def mock_poller_result(): assert pages[0].page_num == 0 assert pages[0].offset == 0 assert pages[0].text == "Page content" + assert len(captured_bodies) == 1 + assert isinstance(captured_bodies[0], AnalyzeDocumentRequest) + assert captured_bodies[0].bytes_source == b"pdf content bytes" + + +@pytest.mark.asyncio +async def test_parse_with_filestorage(monkeypatch): + mock_poller = MagicMock() + captured_bodies: list[AnalyzeDocumentRequest] = [] + + async def mock_begin_analyze_document(self, model_id, **kwargs): + captured_bodies.append(kwargs["body"]) + return mock_poller + + async def mock_poller_result(): + return AnalyzeResult( + content="Page content", + pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])], + tables=[], + figures=[], + ) + + monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document) + monkeypatch.setattr(mock_poller, "result", mock_poller_result) + + parser = DocumentAnalysisParser( + endpoint="https://example.com", + credential=MockAzureCredential(), + ) + stream = io.BytesIO(b"pdf content bytes") + file_storage = FileStorage(stream=stream, filename="upload.pdf") + file_storage.name = "upload.pdf" + pages = [page async for page in parser.parse(file_storage)] + + assert len(pages) == 1 + assert pages[0].page_num == 0 + assert pages[0].offset == 0 + assert pages[0].text == "Page content" + assert len(captured_bodies) == 1 + assert isinstance(captured_bodies[0], AnalyzeDocumentRequest) + assert captured_bodies[0].bytes_source == b"pdf content bytes" + + +@pytest.mark.asyncio +async def test_parse_with_non_seekable_stream(monkeypatch): + mock_poller = MagicMock() + captured_bodies: list[AnalyzeDocumentRequest] = [] + + async def mock_begin_analyze_document(self, model_id, **kwargs): + captured_bodies.append(kwargs["body"]) + return mock_poller + + async def mock_poller_result(): + return AnalyzeResult( + content="Page content", + pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])], + tables=[], + figures=[], + ) + + monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document) + monkeypatch.setattr(mock_poller, "result", mock_poller_result) + + class NonSeekableStream: + def __init__(self, data: bytes, name: str): + self._data = data + self._name = name + self._consumed = False + + @property + def name(self) -> str: # type: ignore[override] + return self._name + + def read(self) -> bytes: + return self._data + + parser = DocumentAnalysisParser( + endpoint="https://example.com", + credential=MockAzureCredential(), + ) + + stream = NonSeekableStream(b"pdf content bytes", "nonseekable.pdf") + pages = [page async for page in parser.parse(stream)] + + assert len(pages) == 1 + assert pages[0].page_num == 0 + assert pages[0].offset == 0 + assert pages[0].text == "Page content" + assert len(captured_bodies) == 1 + assert isinstance(captured_bodies[0], AnalyzeDocumentRequest) + assert captured_bodies[0].bytes_source == b"pdf content bytes" @pytest.mark.asyncio async def test_parse_doc_with_tables(monkeypatch): mock_poller = MagicMock() + captured_bodies: list[AnalyzeDocumentRequest] = [] - async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs): + async def mock_begin_analyze_document(self, model_id, **kwargs): + captured_bodies.append(kwargs["body"]) return mock_poller async def mock_poller_result(): @@ -281,13 +379,17 @@ async def mock_poller_result(): pages[0].text == "# Simple HTML Table\n\n\n
Header 1Header 2
Cell 1Cell 2
Cell 3Cell 4
" ) + assert len(captured_bodies) == 1 + assert isinstance(captured_bodies[0], AnalyzeDocumentRequest) @pytest.mark.asyncio async def test_parse_doc_with_figures(monkeypatch): mock_poller = MagicMock() + captured_kwargs: list[dict] = [] - async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs): + async def mock_begin_analyze_document(self, model_id, **kwargs): + captured_kwargs.append(kwargs) return mock_poller async def mock_poller_result(): @@ -330,13 +432,20 @@ async def mock_poller_result(): == '# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n
\n\n\nThis is text after the figure that\'s not part of it.' ) assert pages[0].images[0].placeholder == '
' + assert len(captured_kwargs) == 1 + body = captured_kwargs[0]["body"] + assert isinstance(body, AnalyzeDocumentRequest) + assert captured_kwargs[0]["output"] == ["figures"] + assert captured_kwargs[0]["features"] == ["ocrHighResolution"] @pytest.mark.asyncio async def test_parse_unsupportedformat(monkeypatch, caplog): mock_poller = MagicMock() + captured_kwargs: list[dict] = [] - async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs): + async def mock_begin_analyze_document(self, model_id, **kwargs): + captured_kwargs.append(kwargs) if kwargs.get("features") == ["ocrHighResolution"]: @@ -387,6 +496,11 @@ async def mock_poller_result(): assert pages[0].page_num == 0 assert pages[0].offset == 0 assert pages[0].text == "Page content" + assert len(captured_kwargs) == 2 + assert captured_kwargs[0]["features"] == ["ocrHighResolution"] + assert isinstance(captured_kwargs[0]["body"], AnalyzeDocumentRequest) + assert captured_kwargs[1].get("features") is None + assert isinstance(captured_kwargs[1]["body"], AnalyzeDocumentRequest) @pytest.mark.asyncio