Skip to content

Commit ee6c89b

Browse files
committed
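Fix issue with upload: read the uploaded file into bytes and send it to Document Intelligence via `body=AnalyzeDocumentRequest(bytes_source=...)`, and upgrade azure-ai-documentintelligence to 1.0.2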
1 parent 86b6485 commit ee6c89b

File tree

5 files changed

+94
-28
lines changed

5 files changed

+94
-28
lines changed

app/backend/app.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -356,17 +356,13 @@ async def upload(auth_claims: dict[str, Any]):
356356
if "file" not in request_files:
357357
return jsonify({"message": "No file part in the request", "status": "failed"}), 400
358358

359-
try:
360-
user_oid = auth_claims["oid"]
361-
file = request_files.getlist("file")[0]
362-
adls_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER]
363-
file_url = await adls_manager.upload_blob(file, file.filename, user_oid)
364-
ingester: UploadUserFileStrategy = current_app.config[CONFIG_INGESTER]
365-
await ingester.add_file(File(content=file, url=file_url, acls={"oids": [user_oid]}), user_oid=user_oid)
366-
return jsonify({"message": "File uploaded successfully"}), 200
367-
except Exception as error:
368-
current_app.logger.error("Error uploading file: %s", error)
369-
return jsonify({"message": "Error uploading file, check server logs for details.", "status": "failed"}), 500
359+
user_oid = auth_claims["oid"]
360+
file = request_files.getlist("file")[0]
361+
adls_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER]
362+
file_url = await adls_manager.upload_blob(file, file.filename, user_oid)
363+
ingester: UploadUserFileStrategy = current_app.config[CONFIG_INGESTER]
364+
await ingester.add_file(File(content=file, url=file_url, acls={"oids": [user_oid]}), user_oid=user_oid)
365+
return jsonify({"message": "File uploaded successfully"}), 200
370366

371367

372368
@bp.post("/delete_uploaded")

app/backend/prepdocslib/pdfparser.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,21 +68,27 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
6868
async with DocumentIntelligenceClient(
6969
endpoint=self.endpoint, credential=self.credential
7070
) as document_intelligence_client:
71-
file_analyzed = False
71+
# Always convert to bytes up front to avoid passing a FileStorage/stream object
72+
try:
73+
content.seek(0)
74+
except Exception:
75+
pass
76+
content_bytes = content.read()
77+
78+
poller = None
79+
doc_for_pymupdf = None
80+
7281
if self.process_figures:
73-
content_bytes = content.read()
7482
try:
7583
poller = await document_intelligence_client.begin_analyze_document(
7684
model_id="prebuilt-layout",
77-
analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
85+
body=AnalyzeDocumentRequest(bytes_source=content_bytes),
7886
output=["figures"],
7987
features=["ocrHighResolution"],
8088
output_content_format="markdown",
8189
)
8290
doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
83-
file_analyzed = True
8491
except HttpResponseError as e:
85-
content.seek(0)
8692
if e.error and e.error.code == "InvalidArgument":
8793
logger.error(
8894
"This document type does not support media description. Proceeding with standard analysis."
@@ -92,10 +98,12 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
9298
"Unexpected error analyzing document for media description: %s. Proceeding with standard analysis.",
9399
e,
94100
)
101+
poller = None
95102

96-
if file_analyzed is False:
103+
if poller is None:
97104
poller = await document_intelligence_client.begin_analyze_document(
98-
model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
105+
model_id=self.model_id,
106+
body=AnalyzeDocumentRequest(bytes_source=content_bytes),
99107
)
100108
analyze_result: AnalyzeResult = await poller.result()
101109

@@ -122,11 +130,9 @@ class ObjectType(Enum):
122130
TABLE = 0
123131
FIGURE = 1
124132

125-
MaskEntry = tuple[ObjectType, Optional[int]]
126-
127133
page_offset = page.spans[0].offset
128134
page_length = page.spans[0].length
129-
mask_chars: list[MaskEntry] = [(ObjectType.NONE, None)] * page_length
135+
mask_chars: list[tuple[ObjectType, Optional[int]]] = [(ObjectType.NONE, None)] * page_length
130136
# mark all positions of the table spans in the page
131137
for table_idx, table in enumerate(tables_on_page):
132138
for span in table.spans:
@@ -146,7 +152,7 @@ class ObjectType(Enum):
146152

147153
# build page text by replacing characters in table spans with table html
148154
page_text = ""
149-
added_objects: set[MaskEntry] = set()
155+
added_objects: set[tuple[ObjectType, Optional[int]]] = set()
150156
for idx, mask_char in enumerate(mask_chars):
151157
object_type, object_idx = mask_char
152158
if object_type == ObjectType.NONE:

app/backend/requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ quart-cors
55
openai>=1.109.1
66
tiktoken
77
tenacity
8-
azure-ai-documentintelligence==1.0.0b4
8+
azure-ai-documentintelligence==1.0.2
99
azure-cognitiveservices-speech
1010
azure-cosmos
1111
azure-search-documents==11.7.0b1

app/backend/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ async-timeout==5.0.1
2424
# via aiohttp
2525
attrs==25.3.0
2626
# via aiohttp
27-
azure-ai-documentintelligence==1.0.0b4
27+
azure-ai-documentintelligence==1.0.2
2828
# via -r requirements.in
2929
azure-cognitiveservices-speech==1.40.0
3030
# via -r requirements.in

tests/test_pdfparser.py

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import pytest
1010
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
1111
from azure.ai.documentintelligence.models import (
12+
AnalyzeDocumentRequest,
1213
AnalyzeResult,
1314
BoundingRegion,
1415
DocumentCaption,
@@ -21,6 +22,7 @@
2122
from azure.core.credentials import AzureKeyCredential
2223
from azure.core.exceptions import HttpResponseError
2324
from PIL import Image, ImageChops
25+
from werkzeug.datastructures import FileStorage
2426

2527
from prepdocslib.figureprocessor import (
2628
FigureProcessor,
@@ -178,8 +180,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box):
178180
@pytest.mark.asyncio
179181
async def test_parse_simple(monkeypatch):
180182
mock_poller = MagicMock()
183+
captured_bodies: list[AnalyzeDocumentRequest] = []
181184

182-
async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
185+
async def mock_begin_analyze_document(self, model_id, **kwargs):
186+
body = kwargs["body"]
187+
captured_bodies.append(body)
183188
return mock_poller
184189

185190
async def mock_poller_result():
@@ -205,13 +210,56 @@ async def mock_poller_result():
205210
assert pages[0].page_num == 0
206211
assert pages[0].offset == 0
207212
assert pages[0].text == "Page content"
213+
assert len(captured_bodies) == 1
214+
assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
215+
assert captured_bodies[0].bytes_source == b"pdf content bytes"
216+
217+
218+
@pytest.mark.asyncio
219+
async def test_parse_with_filestorage(monkeypatch):
220+
mock_poller = MagicMock()
221+
captured_bodies: list[AnalyzeDocumentRequest] = []
222+
223+
async def mock_begin_analyze_document(self, model_id, **kwargs):
224+
captured_bodies.append(kwargs["body"])
225+
return mock_poller
226+
227+
async def mock_poller_result():
228+
return AnalyzeResult(
229+
content="Page content",
230+
pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
231+
tables=[],
232+
figures=[],
233+
)
234+
235+
monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
236+
monkeypatch.setattr(mock_poller, "result", mock_poller_result)
237+
238+
parser = DocumentAnalysisParser(
239+
endpoint="https://example.com",
240+
credential=MockAzureCredential(),
241+
)
242+
stream = io.BytesIO(b"pdf content bytes")
243+
file_storage = FileStorage(stream=stream, filename="upload.pdf")
244+
file_storage.name = "upload.pdf"
245+
pages = [page async for page in parser.parse(file_storage)]
246+
247+
assert len(pages) == 1
248+
assert pages[0].page_num == 0
249+
assert pages[0].offset == 0
250+
assert pages[0].text == "Page content"
251+
assert len(captured_bodies) == 1
252+
assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
253+
assert captured_bodies[0].bytes_source == b"pdf content bytes"
208254

209255

210256
@pytest.mark.asyncio
211257
async def test_parse_doc_with_tables(monkeypatch):
212258
mock_poller = MagicMock()
259+
captured_bodies: list[AnalyzeDocumentRequest] = []
213260

214-
async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
261+
async def mock_begin_analyze_document(self, model_id, **kwargs):
262+
captured_bodies.append(kwargs["body"])
215263
return mock_poller
216264

217265
async def mock_poller_result():
@@ -281,13 +329,17 @@ async def mock_poller_result():
281329
pages[0].text
282330
== "# Simple HTML Table\n\n\n<figure><table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table></figure>"
283331
)
332+
assert len(captured_bodies) == 1
333+
assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
284334

285335

286336
@pytest.mark.asyncio
287337
async def test_parse_doc_with_figures(monkeypatch):
288338
mock_poller = MagicMock()
339+
captured_kwargs: list[dict] = []
289340

290-
async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
341+
async def mock_begin_analyze_document(self, model_id, **kwargs):
342+
captured_kwargs.append(kwargs)
291343
return mock_poller
292344

293345
async def mock_poller_result():
@@ -330,13 +382,20 @@ async def mock_poller_result():
330382
== '# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n<figure id="1.1"></figure>\n\n\nThis is text after the figure that\'s not part of it.'
331383
)
332384
assert pages[0].images[0].placeholder == '<figure id="1.1"></figure>'
385+
assert len(captured_kwargs) == 1
386+
body = captured_kwargs[0]["body"]
387+
assert isinstance(body, AnalyzeDocumentRequest)
388+
assert captured_kwargs[0]["output"] == ["figures"]
389+
assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
333390

334391

335392
@pytest.mark.asyncio
336393
async def test_parse_unsupportedformat(monkeypatch, caplog):
337394
mock_poller = MagicMock()
395+
captured_kwargs: list[dict] = []
338396

339-
async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
397+
async def mock_begin_analyze_document(self, model_id, **kwargs):
398+
captured_kwargs.append(kwargs)
340399

341400
if kwargs.get("features") == ["ocrHighResolution"]:
342401

@@ -387,6 +446,11 @@ async def mock_poller_result():
387446
assert pages[0].page_num == 0
388447
assert pages[0].offset == 0
389448
assert pages[0].text == "Page content"
449+
assert len(captured_kwargs) == 2
450+
assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
451+
assert isinstance(captured_kwargs[0]["body"], AnalyzeDocumentRequest)
452+
assert captured_kwargs[1].get("features") is None
453+
assert isinstance(captured_kwargs[1]["body"], AnalyzeDocumentRequest)
390454

391455

392456
@pytest.mark.asyncio

0 commit comments

Comments
 (0)