diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py
index 666ca33b2c..d8c69d000f 100644
--- a/app/backend/prepdocslib/pdfparser.py
+++ b/app/backend/prepdocslib/pdfparser.py
@@ -68,21 +68,27 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
async with DocumentIntelligenceClient(
endpoint=self.endpoint, credential=self.credential
) as document_intelligence_client:
- file_analyzed = False
+ # Always convert to bytes up front to avoid passing a FileStorage/stream object
+ try:
+ content.seek(0)
+ except Exception:
+ pass
+ content_bytes = content.read()
+
+ poller = None
+ doc_for_pymupdf = None
+
if self.process_figures:
- content_bytes = content.read()
try:
poller = await document_intelligence_client.begin_analyze_document(
model_id="prebuilt-layout",
- analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
+ body=AnalyzeDocumentRequest(bytes_source=content_bytes),
output=["figures"],
features=["ocrHighResolution"],
output_content_format="markdown",
)
doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
- file_analyzed = True
except HttpResponseError as e:
- content.seek(0)
if e.error and e.error.code == "InvalidArgument":
logger.error(
"This document type does not support media description. Proceeding with standard analysis."
@@ -92,10 +98,12 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
"Unexpected error analyzing document for media description: %s. Proceeding with standard analysis.",
e,
)
+ poller = None
- if file_analyzed is False:
+ if poller is None:
poller = await document_intelligence_client.begin_analyze_document(
- model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
+ model_id=self.model_id,
+ body=AnalyzeDocumentRequest(bytes_source=content_bytes),
)
analyze_result: AnalyzeResult = await poller.result()
diff --git a/app/backend/requirements.in b/app/backend/requirements.in
index ba8af3ef36..9b073d704e 100644
--- a/app/backend/requirements.in
+++ b/app/backend/requirements.in
@@ -5,7 +5,7 @@ quart-cors
openai>=1.109.1
tiktoken
tenacity
-azure-ai-documentintelligence==1.0.0b4
+azure-ai-documentintelligence==1.0.2
azure-cognitiveservices-speech
azure-cosmos
azure-search-documents==11.7.0b1
diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt
index ccdb3b8a00..224813b651 100644
--- a/app/backend/requirements.txt
+++ b/app/backend/requirements.txt
@@ -24,7 +24,7 @@ async-timeout==5.0.1
# via aiohttp
attrs==25.3.0
# via aiohttp
-azure-ai-documentintelligence==1.0.0b4
+azure-ai-documentintelligence==1.0.2
# via -r requirements.in
azure-cognitiveservices-speech==1.40.0
# via -r requirements.in
diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py
index 23cd2dcabf..4a7017e22c 100644
--- a/tests/test_pdfparser.py
+++ b/tests/test_pdfparser.py
@@ -9,6 +9,7 @@
import pytest
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
+ AnalyzeDocumentRequest,
AnalyzeResult,
BoundingRegion,
DocumentCaption,
@@ -21,6 +22,7 @@
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from PIL import Image, ImageChops
+from werkzeug.datastructures import FileStorage
from prepdocslib.figureprocessor import (
FigureProcessor,
@@ -178,8 +180,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box):
@pytest.mark.asyncio
async def test_parse_simple(monkeypatch):
mock_poller = MagicMock()
+ captured_bodies: list[AnalyzeDocumentRequest] = []
- async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+ async def mock_begin_analyze_document(self, model_id, **kwargs):
+ body = kwargs["body"]
+ captured_bodies.append(body)
return mock_poller
async def mock_poller_result():
@@ -205,13 +210,106 @@ async def mock_poller_result():
assert pages[0].page_num == 0
assert pages[0].offset == 0
assert pages[0].text == "Page content"
+ assert len(captured_bodies) == 1
+ assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
+ assert captured_bodies[0].bytes_source == b"pdf content bytes"
+
+
+@pytest.mark.asyncio
+async def test_parse_with_filestorage(monkeypatch):
+    """A werkzeug FileStorage upload reaches the analyze call as an AnalyzeDocumentRequest of raw bytes."""
+    poller_stub = MagicMock()
+    sent_bodies: list[AnalyzeDocumentRequest] = []
+
+    async def fake_begin_analyze_document(self, model_id, **kwargs):
+        # Record the request body so the assertions below can inspect it.
+        sent_bodies.append(kwargs["body"])
+        return poller_stub
+
+    async def fake_result():
+        only_page = DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])
+        return AnalyzeResult(content="Page content", pages=[only_page], tables=[], figures=[])
+
+    # Patch the SDK client method and the poller it hands back.
+    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", fake_begin_analyze_document)
+    monkeypatch.setattr(poller_stub, "result", fake_result)
+
+    payload = b"pdf content bytes"
+    upload = FileStorage(stream=io.BytesIO(payload), filename="upload.pdf")
+    upload.name = "upload.pdf"
+    parser = DocumentAnalysisParser(endpoint="https://example.com", credential=MockAzureCredential())
+
+    pages = [page async for page in parser.parse(upload)]
+
+    # Exactly one page comes back, carrying the mocked content.
+    assert len(pages) == 1
+    first_page = pages[0]
+    assert (first_page.page_num, first_page.offset, first_page.text) == (0, 0, "Page content")
+
+    # Exactly one request was issued, wrapping the bytes of the upload.
+    assert len(sent_bodies) == 1
+    (body,) = sent_bodies
+    assert isinstance(body, AnalyzeDocumentRequest)
+    assert body.bytes_source == payload
+
+
+@pytest.mark.asyncio
+async def test_parse_with_non_seekable_stream(monkeypatch):
+    """A stream exposing only ``name`` and ``read()`` (no ``seek()``) is still parsed from its bytes."""
+    mock_poller = MagicMock()
+    captured_bodies: list[AnalyzeDocumentRequest] = []
+
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_bodies.append(kwargs["body"])
+        return mock_poller
+
+    async def mock_poller_result():
+        return AnalyzeResult(
+            content="Page content",
+            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
+            tables=[],
+            figures=[],
+        )
+
+    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
+    monkeypatch.setattr(mock_poller, "result", mock_poller_result)
+
+    class NonSeekableStream:
+        """Minimal file-like stand-in that deliberately omits ``seek()``."""
+
+        def __init__(self, data: bytes, name: str):
+            self._data = data
+            self._name = name
+
+        @property
+        def name(self) -> str:
+            return self._name
+
+        def read(self) -> bytes:
+            return self._data
+
+    parser = DocumentAnalysisParser(endpoint="https://example.com", credential=MockAzureCredential())
+
+    stream = NonSeekableStream(b"pdf content bytes", "nonseekable.pdf")
+    pages = [page async for page in parser.parse(stream)]
+
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "Page content"
+    # Exactly one analyze call, carrying the bytes returned by read().
+    assert len(captured_bodies) == 1
+    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
+    assert captured_bodies[0].bytes_source == b"pdf content bytes"
@pytest.mark.asyncio
async def test_parse_doc_with_tables(monkeypatch):
mock_poller = MagicMock()
+ captured_bodies: list[AnalyzeDocumentRequest] = []
- async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+ async def mock_begin_analyze_document(self, model_id, **kwargs):
+ captured_bodies.append(kwargs["body"])
return mock_poller
async def mock_poller_result():
@@ -281,13 +379,17 @@ async def mock_poller_result():
pages[0].text
== "# Simple HTML Table\n\n\n| Header 1 | Header 2 |
|---|
| Cell 1 | Cell 2 |
| Cell 3 | Cell 4 |
"
)
+ assert len(captured_bodies) == 1
+ assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
@pytest.mark.asyncio
async def test_parse_doc_with_figures(monkeypatch):
mock_poller = MagicMock()
+ captured_kwargs: list[dict] = []
- async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+ async def mock_begin_analyze_document(self, model_id, **kwargs):
+ captured_kwargs.append(kwargs)
return mock_poller
async def mock_poller_result():
@@ -330,13 +432,20 @@ async def mock_poller_result():
== '# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n\n\n\nThis is text after the figure that\'s not part of it.'
)
assert pages[0].images[0].placeholder == ''
+ assert len(captured_kwargs) == 1
+ body = captured_kwargs[0]["body"]
+ assert isinstance(body, AnalyzeDocumentRequest)
+ assert captured_kwargs[0]["output"] == ["figures"]
+ assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
@pytest.mark.asyncio
async def test_parse_unsupportedformat(monkeypatch, caplog):
mock_poller = MagicMock()
+ captured_kwargs: list[dict] = []
- async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+ async def mock_begin_analyze_document(self, model_id, **kwargs):
+ captured_kwargs.append(kwargs)
if kwargs.get("features") == ["ocrHighResolution"]:
@@ -387,6 +496,11 @@ async def mock_poller_result():
assert pages[0].page_num == 0
assert pages[0].offset == 0
assert pages[0].text == "Page content"
+ assert len(captured_kwargs) == 2
+ assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
+ assert isinstance(captured_kwargs[0]["body"], AnalyzeDocumentRequest)
+ assert captured_kwargs[1].get("features") is None
+ assert isinstance(captured_kwargs[1]["body"], AnalyzeDocumentRequest)
@pytest.mark.asyncio