
Commit 2aa1590

Merge branch 'main' into ignite2025
2 parents 92647b4 + 5edb536 commit 2aa1590

6 files changed (+169 −47 lines)

app/backend/prepdocslib/pdfparser.py

Lines changed: 15 additions & 7 deletions
@@ -68,21 +68,27 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
         async with DocumentIntelligenceClient(
             endpoint=self.endpoint, credential=self.credential
         ) as document_intelligence_client:
-            file_analyzed = False
+            # Always convert to bytes up front to avoid passing a FileStorage/stream object
+            try:
+                content.seek(0)
+            except Exception:
+                pass
+            content_bytes = content.read()
+
+            poller = None
+            doc_for_pymupdf = None
+
             if self.process_figures:
-                content_bytes = content.read()
                 try:
                     poller = await document_intelligence_client.begin_analyze_document(
                         model_id="prebuilt-layout",
-                        analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
+                        body=AnalyzeDocumentRequest(bytes_source=content_bytes),
                         output=["figures"],
                         features=["ocrHighResolution"],
                         output_content_format="markdown",
                     )
                     doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
-                    file_analyzed = True
                 except HttpResponseError as e:
-                    content.seek(0)
                     if e.error and e.error.code == "InvalidArgument":
                         logger.error(
                             "This document type does not support media description. Proceeding with standard analysis."
@@ -92,10 +98,12 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
                             "Unexpected error analyzing document for media description: %s. Proceeding with standard analysis.",
                             e,
                         )
+                    poller = None
 
-            if file_analyzed is False:
+            if poller is None:
                 poller = await document_intelligence_client.begin_analyze_document(
-                    model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
+                    model_id=self.model_id,
+                    body=AnalyzeDocumentRequest(bytes_source=content_bytes),
                 )
             analyze_result: AnalyzeResult = await poller.result()

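For context on the API change above: `azure-ai-documentintelligence` 1.0.x renames the beta `analyze_request=` keyword to `body=`, and the parser now always reads the input into bytes before calling the service. A minimal standalone sketch of that call pattern, assuming the 1.0.x GA API; the endpoint, key, and file path are placeholders, not values from this repo:

```python
# Sketch of analyzing a document with azure-ai-documentintelligence 1.0.x,
# passing bytes via AnalyzeDocumentRequest instead of a raw stream.
# Endpoint, key, and file path are placeholders.
import asyncio

from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, AnalyzeResult
from azure.core.credentials import AzureKeyCredential


async def analyze_pdf(path: str) -> AnalyzeResult:
    # Read the whole file up front, as the parser now does
    with open(path, "rb") as f:
        content_bytes = f.read()
    async with DocumentIntelligenceClient(
        endpoint="https://<your-resource>.cognitiveservices.azure.com/",
        credential=AzureKeyCredential("<key>"),
    ) as client:
        poller = await client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=content_bytes),
        )
        return await poller.result()


if __name__ == "__main__":
    result = asyncio.run(analyze_pdf("sample.pdf"))
    print(result.content)
```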
app/backend/requirements.in

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ quart-cors
 openai>=1.109.1
 tiktoken
 tenacity
-azure-ai-documentintelligence==1.0.0b4
+azure-ai-documentintelligence==1.0.2
 azure-cognitiveservices-speech
 azure-cosmos
 azure-search-documents==11.7.0b2

app/backend/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ async-timeout==5.0.1
     # via aiohttp
 attrs==25.3.0
     # via aiohttp
-azure-ai-documentintelligence==1.0.0b4
+azure-ai-documentintelligence==1.0.2
     # via -r requirements.in
 azure-cognitiveservices-speech==1.40.0
     # via -r requirements.in

docs/data_ingestion.md

Lines changed: 27 additions & 4 deletions
@@ -14,7 +14,8 @@ The chat app provides two ways to ingest data: manual ingestion and cloud ingest
   - [Indexing additional documents](#indexing-additional-documents)
   - [Removing documents](#removing-documents)
 - [Cloud ingestion](#cloud-ingestion)
-  - [Custom skills pipeline](#custom-skills-pipeline)
+  - [Enabling cloud ingestion](#enabling-cloud-ingestion)
+  - [Indexer architecture](#indexer-architecture)
   - [Indexing of additional documents](#indexing-of-additional-documents)
   - [Removal of documents](#removal-of-documents)
   - [Scheduled indexing](#scheduled-indexing)
@@ -136,11 +137,33 @@ You can also remove individual documents by using the `--remove` flag. Open eith
 
 This project includes an optional feature to perform data ingestion in the cloud using Azure Functions as custom skills for Azure AI Search indexers. This approach offloads the ingestion workload from your local machine to the cloud, allowing for more scalable and efficient processing of large datasets.
 
-You must first explicitly [enable cloud ingestion](./deploy_features.md#enabling-cloud-ingestion) in the `azd` environment to use this feature.
+### Enabling cloud ingestion
 
-This feature cannot be used on existing index. You need to create a new index or drop and recreate an existing index. In the newly created index schema, a new field 'parent_id' is added. This is used internally by the indexer to manage life cycle of chunks.
+1. If you've previously deployed, delete the existing search index or create a new index. This feature cannot be used on existing index. In the newly created index schema, a new field 'parent_id' is added. This is used internally by the indexer to manage life cycle of chunks. Run this command to set a new index name:
 
-### Custom skills pipeline
+    ```shell
+    azd env set AZURE_SEARCH_INDEX cloudindex
+    ```
+
+2. Run this command:
+
+    ```shell
+    azd env set USE_CLOUD_INGESTION true
+    ```
+
+3. Open `azure.yaml` and un-comment the document-extractor, figure-processor, and text-processor sections. Those are the Azure Functions apps that will be deployed and serve as Azure AI Search skills.
+
+4. Provision the new Azure Functions resources, deploy the function apps, and update the search indexer with:
+
+    ```shell
+    azd up
+    ```
+
+5. That will upload the documents in the `data/` folder to the Blob storage container, create the indexer and skillset, and run the indexer to ingest the data. You can monitor the indexer status from the portal.
+
+6. When you have new documents to ingest, you can upload documents to the Blob storage container and run the indexer from the Azure Portal to ingest new documents.
+
+### Indexer architecture
 
 The cloud ingestion pipeline uses four Azure Functions as custom skills within an Azure AI Search indexer. Each function corresponds to a stage in the ingestion process. Here's how it works:

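The paragraph above describes four Azure Functions acting as custom skills within the indexer. As a rough, hypothetical sketch of what such a skill handles — the standard Azure AI Search custom Web API skill payload of `values`/`recordId`/`data` records — the outline below is invented for illustration and is not the repo's actual document-extractor, figure-processor, or text-processor code:

```python
# Hypothetical sketch of an Azure Function acting as a custom Web API skill
# for an Azure AI Search indexer. Route name, field names, and chunking logic
# are placeholders for illustration only.
import json

import azure.functions as func

app = func.FunctionApp()


@app.route(route="text_processor", auth_level=func.AuthLevel.FUNCTION)
def text_processor(req: func.HttpRequest) -> func.HttpResponse:
    body = req.get_json()
    results = []
    for record in body.get("values", []):
        text = record.get("data", {}).get("text", "")
        results.append(
            {
                "recordId": record["recordId"],  # must echo the incoming recordId
                "data": {"chunks": [text[i : i + 1000] for i in range(0, len(text), 1000)]},
                "errors": None,
                "warnings": None,
            }
        )
    return func.HttpResponse(json.dumps({"values": results}), mimetype="application/json")
```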
docs/deploy_features.md

Lines changed: 7 additions & 30 deletions
@@ -8,6 +8,7 @@ You should typically enable these features before running `azd up`. Once you've
 * [Using different embedding models](#using-different-embedding-models)
 * [Enabling multimodal embeddings and answering](#enabling-multimodal-embeddings-and-answering)
 * [Enabling media description with Azure Content Understanding](#enabling-media-description-with-azure-content-understanding)
+* [Enabling cloud data ingestion](#enabling-cloud-data-ingestion)
 * [Enabling client-side chat history](#enabling-client-side-chat-history)
 * [Enabling persistent chat history with Azure Cosmos DB](#enabling-persistent-chat-history-with-azure-cosmos-db)
 * [Enabling language picker](#enabling-language-picker)
@@ -256,6 +257,12 @@ first [remove the existing documents](./data_ingestion.md#removing-documents) an
 ⚠️ This feature does not yet support DOCX, PPTX, or XLSX formats. If you have figures in those formats, they will be ignored.
 Convert them first to PDF or image formats to enable media description.
 
+## Enabling cloud data ingestion
+
+By default, this project runs a local script in order to ingest data. Once you move beyond the sample documents, you may want to enable [cloud ingestion](./data_ingestion.md#cloud-ingestion), which uses Azure AI Search indexers and custom Azure AI Search skills based off the same code used by the local ingestion. That approach scales better to larger amounts of data.
+
+Learn more in the [cloud ingestion guide](./data_ingestion.md#cloud-ingestion).
+
 ## Enabling client-side chat history
 
 [📺 Watch: (RAG Deep Dive series) Storing chat history](https://www.youtube.com/watch?v=1YiTFnnLVIA)
@@ -322,36 +329,6 @@ Alternatively you can use the browser's built-in [Speech Synthesis API](https://
 azd env set USE_SPEECH_OUTPUT_BROWSER true
 ```
 
-## Enabling cloud data ingestion
-
-By default, this project runs a local script in order to ingest data. Once you move beyond the sample documents, you may want cloud ingestion, which uses Azure AI Search indexers and custom Azure AI Search skills based off the same code used by the local ingestion. That approach scales better to larger amounts of data.
-
-To enable cloud ingestion:
-
-1. If you've previously deployed, delete the existing search index or create a new index using:
-
-    ```shell
-    azd env set AZURE_SEARCH_INDEX cloudindex
-    ```
-
-2. Run this command:
-
-    ```shell
-    azd env set USE_CLOUD_INGESTION true
-    ```
-
-3. Open `azure.yaml` and un-comment the document-extractor, figure-processor, and text-processor sections. Those are the Azure Functions apps that will be deployed and serve as Azure AI Search skills.
-
-4. Provision the new Azure Functions resources, deploy the function apps, and update the search indexer with:
-
-    ```shell
-    azd up
-    ```
-
-5. That will upload the documents in the `data/` folder to the Blob storage container, create the indexer and skillset, and run the indexer to ingest the data. You can monitor the indexer status from the portal.
-
-6. When you have new documents to ingest, you can upload documents to the Blob storage container and run the indexer from the Azure Portal to ingest new documents.
-
 ## Enabling authentication
 
 By default, the deployed Azure web app will have no authentication or access restrictions enabled, meaning anyone with routable network access to the web app can chat with your indexed data. If you'd like to automatically setup authentication and user login as part of the `azd up` process, see [this guide](./login_and_acl.md).

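The step-by-step instructions removed here now live in `docs/data_ingestion.md` (see the data_ingestion.md diff above). Condensed, and assuming you have already un-commented the function apps in `azure.yaml`, the sequence is:

```shell
# Point the app at a fresh index (the feature cannot reuse an existing index)
azd env set AZURE_SEARCH_INDEX cloudindex
# Turn on cloud ingestion
azd env set USE_CLOUD_INGESTION true
# Provision the function apps, deploy, and run the indexer
azd up
```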
tests/test_pdfparser.py

Lines changed: 118 additions & 4 deletions
@@ -9,6 +9,7 @@
 import pytest
 from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
 from azure.ai.documentintelligence.models import (
+    AnalyzeDocumentRequest,
     AnalyzeResult,
     BoundingRegion,
     DocumentCaption,
@@ -21,6 +22,7 @@
 from azure.core.credentials import AzureKeyCredential
 from azure.core.exceptions import HttpResponseError
 from PIL import Image, ImageChops
+from werkzeug.datastructures import FileStorage
 
 from prepdocslib.figureprocessor import (
     FigureProcessor,
@@ -178,8 +180,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box):
 @pytest.mark.asyncio
 async def test_parse_simple(monkeypatch):
     mock_poller = MagicMock()
+    captured_bodies: list[AnalyzeDocumentRequest] = []
 
-    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        body = kwargs["body"]
+        captured_bodies.append(body)
         return mock_poller
 
     async def mock_poller_result():
@@ -205,13 +210,106 @@ async def mock_poller_result():
     assert pages[0].page_num == 0
     assert pages[0].offset == 0
     assert pages[0].text == "Page content"
+    assert len(captured_bodies) == 1
+    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
+    assert captured_bodies[0].bytes_source == b"pdf content bytes"
+
+
+@pytest.mark.asyncio
+async def test_parse_with_filestorage(monkeypatch):
+    mock_poller = MagicMock()
+    captured_bodies: list[AnalyzeDocumentRequest] = []
+
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_bodies.append(kwargs["body"])
+        return mock_poller
+
+    async def mock_poller_result():
+        return AnalyzeResult(
+            content="Page content",
+            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
+            tables=[],
+            figures=[],
+        )
+
+    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
+    monkeypatch.setattr(mock_poller, "result", mock_poller_result)
+
+    parser = DocumentAnalysisParser(
+        endpoint="https://example.com",
+        credential=MockAzureCredential(),
+    )
+    stream = io.BytesIO(b"pdf content bytes")
+    file_storage = FileStorage(stream=stream, filename="upload.pdf")
+    file_storage.name = "upload.pdf"
+    pages = [page async for page in parser.parse(file_storage)]
+
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "Page content"
+    assert len(captured_bodies) == 1
+    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
+    assert captured_bodies[0].bytes_source == b"pdf content bytes"
+
+
+@pytest.mark.asyncio
+async def test_parse_with_non_seekable_stream(monkeypatch):
+    mock_poller = MagicMock()
+    captured_bodies: list[AnalyzeDocumentRequest] = []
+
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_bodies.append(kwargs["body"])
+        return mock_poller
+
+    async def mock_poller_result():
+        return AnalyzeResult(
+            content="Page content",
+            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
+            tables=[],
+            figures=[],
+        )
+
+    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
+    monkeypatch.setattr(mock_poller, "result", mock_poller_result)
+
+    class NonSeekableStream:
+        def __init__(self, data: bytes, name: str):
+            self._data = data
+            self._name = name
+            self._consumed = False
+
+        @property
+        def name(self) -> str:  # type: ignore[override]
+            return self._name
+
+        def read(self) -> bytes:
+            return self._data
+
+    parser = DocumentAnalysisParser(
+        endpoint="https://example.com",
+        credential=MockAzureCredential(),
+    )
+
+    stream = NonSeekableStream(b"pdf content bytes", "nonseekable.pdf")
+    pages = [page async for page in parser.parse(stream)]
+
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "Page content"
+    assert len(captured_bodies) == 1
+    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
+    assert captured_bodies[0].bytes_source == b"pdf content bytes"
 
 
 @pytest.mark.asyncio
 async def test_parse_doc_with_tables(monkeypatch):
     mock_poller = MagicMock()
+    captured_bodies: list[AnalyzeDocumentRequest] = []
 
-    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_bodies.append(kwargs["body"])
         return mock_poller
 
     async def mock_poller_result():
@@ -281,13 +379,17 @@ async def mock_poller_result():
         pages[0].text
         == "# Simple HTML Table\n\n\n<figure><table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table></figure>"
     )
+    assert len(captured_bodies) == 1
+    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
 
 
 @pytest.mark.asyncio
 async def test_parse_doc_with_figures(monkeypatch):
     mock_poller = MagicMock()
+    captured_kwargs: list[dict] = []
 
-    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_kwargs.append(kwargs)
         return mock_poller
 
     async def mock_poller_result():
@@ -330,13 +432,20 @@ async def mock_poller_result():
         == '# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n<figure id="1.1"></figure>\n\n\nThis is text after the figure that\'s not part of it.'
     )
     assert pages[0].images[0].placeholder == '<figure id="1.1"></figure>'
+    assert len(captured_kwargs) == 1
+    body = captured_kwargs[0]["body"]
+    assert isinstance(body, AnalyzeDocumentRequest)
+    assert captured_kwargs[0]["output"] == ["figures"]
+    assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
 
 
 @pytest.mark.asyncio
 async def test_parse_unsupportedformat(monkeypatch, caplog):
     mock_poller = MagicMock()
+    captured_kwargs: list[dict] = []
 
-    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+    async def mock_begin_analyze_document(self, model_id, **kwargs):
+        captured_kwargs.append(kwargs)
 
         if kwargs.get("features") == ["ocrHighResolution"]:
 
@@ -387,6 +496,11 @@ async def mock_poller_result():
     assert pages[0].page_num == 0
     assert pages[0].offset == 0
     assert pages[0].text == "Page content"
+    assert len(captured_kwargs) == 2
+    assert captured_kwargs[0]["features"] == ["ocrHighResolution"]
+    assert isinstance(captured_kwargs[0]["body"], AnalyzeDocumentRequest)
+    assert captured_kwargs[1].get("features") is None
+    assert isinstance(captured_kwargs[1]["body"], AnalyzeDocumentRequest)
 
 
 @pytest.mark.asyncio
