Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions app/backend/prepdocslib/pdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,21 +68,27 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
async with DocumentIntelligenceClient(
endpoint=self.endpoint, credential=self.credential
) as document_intelligence_client:
file_analyzed = False
# Always convert to bytes up front to avoid passing a FileStorage/stream object
try:
content.seek(0)
except Exception:
pass
content_bytes = content.read()

poller = None
doc_for_pymupdf = None

if self.process_figures:
content_bytes = content.read()
try:
poller = await document_intelligence_client.begin_analyze_document(
model_id="prebuilt-layout",
analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
body=AnalyzeDocumentRequest(bytes_source=content_bytes),
output=["figures"],
features=["ocrHighResolution"],
output_content_format="markdown",
)
doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
file_analyzed = True
except HttpResponseError as e:
content.seek(0)
if e.error and e.error.code == "InvalidArgument":
logger.error(
"This document type does not support media description. Proceeding with standard analysis."
Expand All @@ -92,10 +98,12 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
"Unexpected error analyzing document for media description: %s. Proceeding with standard analysis.",
e,
)
poller = None

if file_analyzed is False:
if poller is None:
poller = await document_intelligence_client.begin_analyze_document(
model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
model_id=self.model_id,
body=AnalyzeDocumentRequest(bytes_source=content_bytes),
)
analyze_result: AnalyzeResult = await poller.result()

Expand Down
2 changes: 1 addition & 1 deletion app/backend/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ quart-cors
openai>=1.109.1
tiktoken
tenacity
azure-ai-documentintelligence==1.0.0b4
azure-ai-documentintelligence==1.0.2
azure-cognitiveservices-speech
azure-cosmos
azure-search-documents==11.7.0b1
Expand Down
2 changes: 1 addition & 1 deletion app/backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ async-timeout==5.0.1
# via aiohttp
attrs==25.3.0
# via aiohttp
azure-ai-documentintelligence==1.0.0b4
azure-ai-documentintelligence==1.0.2
# via -r requirements.in
azure-cognitiveservices-speech==1.40.0
# via -r requirements.in
Expand Down
122 changes: 118 additions & 4 deletions tests/test_pdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pytest
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest,
AnalyzeResult,
BoundingRegion,
DocumentCaption,
Expand All @@ -21,6 +22,7 @@
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from PIL import Image, ImageChops
from werkzeug.datastructures import FileStorage

from prepdocslib.figureprocessor import (
FigureProcessor,
Expand Down Expand Up @@ -178,8 +180,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box):
@pytest.mark.asyncio
async def test_parse_simple(monkeypatch):
mock_poller = MagicMock()
captured_bodies: list[AnalyzeDocumentRequest] = []

async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
async def mock_begin_analyze_document(self, model_id, **kwargs):
body = kwargs["body"]
captured_bodies.append(body)
return mock_poller

async def mock_poller_result():
Expand All @@ -205,13 +210,106 @@ async def mock_poller_result():
assert pages[0].page_num == 0
assert pages[0].offset == 0
assert pages[0].text == "Page content"
assert len(captured_bodies) == 1
assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
assert captured_bodies[0].bytes_source == b"pdf content bytes"


@pytest.mark.asyncio
async def test_parse_with_filestorage(monkeypatch):
    """Parse a werkzeug ``FileStorage`` upload and check the request body sent for analysis.

    ``begin_analyze_document`` is replaced with a stub that records the
    ``body`` keyword argument, so the test can assert that the parser hands
    the service an ``AnalyzeDocumentRequest`` carrying the uploaded bytes.
    """
    sent_requests: list[AnalyzeDocumentRequest] = []
    fake_poller = MagicMock()

    async def fake_result():
        # One page whose single span covers the full 12-character content string.
        only_span = DocumentSpan(offset=0, length=12)
        only_page = DocumentPage(page_number=1, spans=[only_span])
        return AnalyzeResult(
            content="Page content",
            pages=[only_page],
            tables=[],
            figures=[],
        )

    async def fake_begin_analyze_document(self, model_id, **kwargs):
        # Record the request body so the assertions below can inspect it.
        sent_requests.append(kwargs["body"])
        return fake_poller

    monkeypatch.setattr(fake_poller, "result", fake_result)
    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", fake_begin_analyze_document)

    upload = FileStorage(stream=io.BytesIO(b"pdf content bytes"), filename="upload.pdf")
    upload.name = "upload.pdf"

    doc_parser = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
    )

    pages = []
    async for parsed_page in doc_parser.parse(upload):
        pages.append(parsed_page)

    assert len(pages) == 1
    first_page = pages[0]
    assert first_page.page_num == 0
    assert first_page.offset == 0
    assert first_page.text == "Page content"

    assert len(sent_requests) == 1
    request_body = sent_requests[0]
    assert isinstance(request_body, AnalyzeDocumentRequest)
    assert request_body.bytes_source == b"pdf content bytes"


@pytest.mark.asyncio
async def test_parse_with_non_seekable_stream(monkeypatch):
    """Parse a stream object that offers ``name`` and ``read`` but no ``seek``.

    ``begin_analyze_document`` is replaced with a stub that records the
    ``body`` keyword argument, so the test can assert that the bytes returned
    by ``read`` reach the service wrapped in an ``AnalyzeDocumentRequest``.
    """

    class NonSeekableStream:
        # Minimal stand-in for an input stream: it defines no ``seek`` method.
        def __init__(self, data: bytes, name: str):
            self._data = data
            self._name = name
            self._consumed = False

        @property
        def name(self) -> str:  # type: ignore[override]
            return self._name

        def read(self) -> bytes:
            return self._data

    sent_requests: list[AnalyzeDocumentRequest] = []
    fake_poller = MagicMock()

    async def fake_result():
        # One page whose single span covers the full 12-character content string.
        only_span = DocumentSpan(offset=0, length=12)
        only_page = DocumentPage(page_number=1, spans=[only_span])
        return AnalyzeResult(
            content="Page content",
            pages=[only_page],
            tables=[],
            figures=[],
        )

    async def fake_begin_analyze_document(self, model_id, **kwargs):
        # Record the request body so the assertions below can inspect it.
        sent_requests.append(kwargs["body"])
        return fake_poller

    monkeypatch.setattr(fake_poller, "result", fake_result)
    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", fake_begin_analyze_document)

    doc_parser = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
    )
    source = NonSeekableStream(b"pdf content bytes", "nonseekable.pdf")

    pages = []
    async for parsed_page in doc_parser.parse(source):
        pages.append(parsed_page)

    assert len(pages) == 1
    first_page = pages[0]
    assert first_page.page_num == 0
    assert first_page.offset == 0
    assert first_page.text == "Page content"

    assert len(sent_requests) == 1
    request_body = sent_requests[0]
    assert isinstance(request_body, AnalyzeDocumentRequest)
    assert request_body.bytes_source == b"pdf content bytes"


@pytest.mark.asyncio
async def test_parse_doc_with_tables(monkeypatch):
mock_poller = MagicMock()
captured_bodies: list[AnalyzeDocumentRequest] = []

async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
async def mock_begin_analyze_document(self, model_id, **kwargs):
captured_bodies.append(kwargs["body"])
return mock_poller

async def mock_poller_result():
Expand Down Expand Up @@ -281,13 +379,17 @@ async def mock_poller_result():
pages[0].text
== "# Simple HTML Table\n\n\n<figure><table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table></figure>"
)
assert len(captured_bodies) == 1
assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)


@pytest.mark.asyncio
async def test_parse_doc_with_figures(monkeypatch):
mock_poller = MagicMock()
captured_kwargs: list[dict] = []

async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
async def mock_begin_analyze_document(self, model_id, **kwargs):
captured_kwargs.append(kwargs)
return mock_poller

async def mock_poller_result():
Expand Down Expand Up @@ -330,13 +432,20 @@ async def mock_poller_result():
== '# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n<figure id="1.1"></figure>\n\n\nThis is text after the figure that\'s not part of it.'
)
assert pages[0].images[0].placeholder == '<figure id="1.1"></figure>'
assert len(captured_kwargs) == 1
body = captured_kwargs[0]["body"]
assert isinstance(body, AnalyzeDocumentRequest)
assert captured_kwargs[0]["output"] == ["figures"]
assert captured_kwargs[0]["features"] == ["ocrHighResolution"]


@pytest.mark.asyncio
async def test_parse_unsupportedformat(monkeypatch, caplog):
mock_poller = MagicMock()
captured_kwargs: list[dict] = []

async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
async def mock_begin_analyze_document(self, model_id, **kwargs):
captured_kwargs.append(kwargs)

if kwargs.get("features") == ["ocrHighResolution"]:

Expand Down Expand Up @@ -387,6 +496,11 @@ async def mock_poller_result():
assert pages[0].page_num == 0
assert pages[0].offset == 0
assert pages[0].text == "Page content"
assert len(captured_kwargs) == 2
assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
assert isinstance(captured_kwargs[0]["body"], AnalyzeDocumentRequest)
assert captured_kwargs[1].get("features") is None
assert isinstance(captured_kwargs[1]["body"], AnalyzeDocumentRequest)


@pytest.mark.asyncio
Expand Down