diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index 1fcbbc9531..a40e69b73e 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -14,6 +14,7 @@ ) from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential +from azure.core.exceptions import HttpResponseError from PIL import Image from pypdf import PdfReader @@ -68,6 +69,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: async with DocumentIntelligenceClient( endpoint=self.endpoint, credential=self.credential ) as document_intelligence_client: + file_analyzed = False if self.use_content_understanding: if self.content_understanding_endpoint is None: raise ValueError("Content Understanding is enabled but no endpoint was provided") @@ -77,15 +79,29 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: ) cu_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential) content_bytes = content.read() - poller = await document_intelligence_client.begin_analyze_document( - model_id="prebuilt-layout", - analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes), - output=["figures"], - features=["ocrHighResolution"], - output_content_format="markdown", - ) - doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes)) - else: + try: + poller = await document_intelligence_client.begin_analyze_document( + model_id="prebuilt-layout", + analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes), + output=["figures"], + features=["ocrHighResolution"], + output_content_format="markdown", + ) + doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes)) + file_analyzed = True + except HttpResponseError as e: + content.seek(0) + if e.error and e.error.code == "InvalidArgument": + logger.error( + "This document type does not support media description. Proceeding with standard analysis." 
+ ) + else: + logger.error( + "Unexpected error analyzing document for media description: %s. Proceeding with standard analysis.", + e, + ) + + if file_analyzed is False: poller = await document_intelligence_client.begin_analyze_document( model_id=self.model_id, analyze_request=content, content_type="application/octet-stream" ) diff --git a/docs/deploy_features.md b/docs/deploy_features.md index ea0c7e8288..7c5f2a4038 100644 --- a/docs/deploy_features.md +++ b/docs/deploy_features.md @@ -163,7 +163,6 @@ By default, if your documents contain image-like figures, the data ingestion pro so users will not be able to ask questions about them. You can optionally enable the description of media content using Azure Content Understanding. When enabled, the data ingestion process will send figures to Azure Content Understanding and replace the figure with the description in the indexed document. -To learn more about this process and compare it to the gpt-4 vision integration, see [this guide](./data_ingestion.md#media-description). To enable media description with Azure Content Understanding, run: @@ -175,6 +174,9 @@ If you have already run `azd up`, you will need to run `azd provision` to create If you have already indexed your documents and want to re-index them with the media descriptions, first [remove the existing documents](./data_ingestion.md#removing-documents) and then [re-ingest the data](./data_ingestion.md#indexing-additional-documents). +⚠️ This feature does not yet support DOCX, PPTX, or XLSX formats. Figures in documents of those formats will be ignored. +Convert those documents to PDF or image formats first to enable media description. + ## Enabling client-side chat history This feature allows users to view the chat history of their conversation, stored in the browser using [IndexedDB](https://developer.mozilla.org/docs/Web/API/IndexedDB_API). That means the chat history will be available only on the device where the chat was initiated. 
To enable browser-stored chat history, run: diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index 408aa2b2d0..2f21fe6358 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -1,8 +1,9 @@ import io +import json import logging import math import pathlib -from unittest.mock import AsyncMock, MagicMock +from unittest.mock import AsyncMock, MagicMock, Mock import pymupdf import pytest @@ -17,6 +18,7 @@ DocumentTable, DocumentTableCell, ) +from azure.core.exceptions import HttpResponseError from PIL import Image, ImageChops from prepdocslib.mediadescriber import ContentUnderstandingDescriber @@ -308,3 +310,63 @@ async def mock_describe_image(self, image_bytes): pages[0].text == "# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n
Figure 1
Pie chart
\n\n\nThis is text after the figure that's not part of it." ) + + +@pytest.mark.asyncio +async def test_parse_unsupportedformat(monkeypatch, caplog): + mock_poller = MagicMock() + + async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs): + + if kwargs.get("features") == ["ocrHighResolution"]: + + class FakeErrorOne: + def __init__(self): + self.error = Mock(message="A fake error", code="FakeErrorOne") + + class FakeHttpResponse(HttpResponseError): + def __init__(self, response, error, *args, **kwargs): + self.error = error + super().__init__(self, response=response, *args, **kwargs) + + message = { + "error": { + "code": "InvalidArgument", + "message": "A fake error", + } + } + response = Mock(status_code=500, headers={}) + response.text = lambda encoding=None: json.dumps(message).encode("utf-8") + response.headers["content-type"] = "application/json" + response.content_type = "application/json" + raise FakeHttpResponse(response, FakeErrorOne()) + else: + return mock_poller + + async def mock_poller_result(): + return AnalyzeResult( + content="Page content", + pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])], + tables=[], + figures=[], + ) + + monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document) + monkeypatch.setattr(mock_poller, "result", mock_poller_result) + + parser = DocumentAnalysisParser( + endpoint="https://example.com", + credential=MockAzureCredential(), + use_content_understanding=True, + content_understanding_endpoint="https://example.com", + ) + content = io.BytesIO(b"pdf content bytes") + content.name = "test.docx" + with caplog.at_level(logging.ERROR): + pages = [page async for page in parser.parse(content)] + assert "This document type does not support media description." in caplog.text + + assert len(pages) == 1 + assert pages[0].page_num == 0 + assert pages[0].offset == 0 + assert pages[0].text == "Page content"