Azure-Samples
diff --git a/‎app/backend/prepdocs.py‎
Lines changed: 10 additions & 1 deletion b/‎app/backend/prepdocs.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎app/backend/prepdocslib/cu_image.py‎
Lines changed: 139 additions & 0 deletions b/‎app/backend/prepdocslib/cu_image.py‎
Lines changed: 139 additions & 0 deletions
diff --git a/‎app/backend/prepdocslib/filestrategy.py‎
Lines changed: 9 additions & 0 deletions b/‎app/backend/prepdocslib/filestrategy.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎app/backend/prepdocslib/pdfparser.py‎
Lines changed: 51 additions & 30 deletions b/‎app/backend/prepdocslib/pdfparser.py‎
Lines changed: 51 additions & 30 deletions
diff --git a/‎app/backend/requirements.in‎
Lines changed: 1 addition & 1 deletion b/‎app/backend/requirements.in‎
Lines changed: 1 addition & 1 deletion
@@ -158,6 +158,8 @@ def setup_file_processors(
     local_pdf_parser: bool = False,
     local_html_parser: bool = False,
     search_images: bool = False,
+    use_content_understanding: bool = False,
+    content_understanding_endpoint: Union[str, None] = None,
 ):
     sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
 
@@ -170,6 +172,8 @@ def setup_file_processors(
         doc_int_parser = DocumentAnalysisParser(
             endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
             credential=documentintelligence_creds,
+            use_content_understanding=use_content_understanding,
+            content_understanding_endpoint=content_understanding_endpoint,
         )
 
     pdf_parser: Optional[Parser] = None
@@ -298,14 +302,15 @@ async def main(strategy: Strategy, setup_index: bool = True):
         logging.basicConfig(format="%(message)s")
         # We only set the level to INFO for our logger,
         # to avoid seeing the noisy INFO level logs from the Azure SDKs
-        logger.setLevel(logging.INFO)
+        logger.setLevel(logging.DEBUG)
 
     load_azd_env()
 
     use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true"
     use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true"
     use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None
     dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false"
+    use_content_understanding = os.getenv("USE_CONTENT_UNDERSTANDING", "").lower() == "true"
 
     # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments.
     if tenant_id := os.getenv("AZURE_TENANT_ID"):
@@ -403,6 +408,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
             local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true",
             local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true",
             search_images=use_gptvision,
+            use_content_understanding=use_content_understanding,
+            content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
         )
         image_embeddings_service = setup_image_embeddings_service(
             azure_credential=azd_credential,
@@ -421,6 +428,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
             search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
             use_acls=use_acls,
             category=args.category,
+            use_content_understanding=use_content_understanding,
+            content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
         )
 
     loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall))
 
@@ -0,0 +1,139 @@
+from typing import Union
+import logging
+
+import aiohttp
+from azure.core.credentials_async import AsyncTokenCredential
+from tenacity import retry, stop_after_attempt, wait_fixed
+from tenacity import retry_if_exception_type
+
+from azure.identity.aio import get_bearer_token_provider
+
+
+logger = logging.getLogger("scripts")
+
+# Sent as the "api-version" query parameter on the analyzer create and analyze requests.
+CU_API_VERSION = "2024-12-01-preview"
+
+# Analyzer management path templates.
+# NOTE(review): none of the PATH_* templates are referenced by ContentUnderstandingManager,
+# which builds its URLs inline - confirm whether they are still needed.
+PATH_ANALYZER_MANAGEMENT = "/analyzers/{analyzerId}"
+PATH_ANALYZER_MANAGEMENT_OPERATION = "/analyzers/{analyzerId}/operations/{operationId}"
+
+# Define Analyzer inference paths
+PATH_ANALYZER_INFERENCE = "/analyzers/{analyzerId}:analyze"
+PATH_ANALYZER_INFERENCE_GET_IMAGE = "/analyzers/{analyzerId}/results/{operationId}/images/{imageId}"
+
+# Analyzer id: used as image_schema["analyzerId"] and in the ":analyze" request URL.
+analyzer_name = "image_schema_analyzer"
+# Analyzer definition sent as the JSON body of the PUT request that creates the analyzer.
+image_schema = {
+    "analyzerId": analyzer_name,
+    "name": "Image understanding",
+    "description": "Extract detailed structured information from images extracted from documents.",
+    "baseAnalyzerId": "prebuilt-image",
+    "scenario": "image",
+    "config": {"returnDetails": False},
+    "fieldSchema": {
+        "name": "ImageInformation",
+        # NOTE(review): every other entry in this schema uses the singular "description" key -
+        # confirm that "descriptions" is what the service expects here.
+        "descriptions": "Structured information from images.",
+        # The analyze result's "fields" object is what run_cu_image returns to its caller.
+        "fields": {
+            "Title": {
+                "type": "string",
+                "description": "Title for the image (either taken from the image directly or a good short title based off content)",
+            },
+            "ImageType": {
+                "type": "string",
+                "description": "The type of image.",
+                "kind": "classify",
+                "enum": [
+                    "chart",
+                    "diagram",
+                    "table",
+                    "figure",
+                    "photo",
+                    "screenshot",
+                    "logo",
+                    "icon",
+                    "map",
+                    "infographic",
+                    "other",
+                ],
+            },
+            "MarkdownDescription": {
+                "type": "string",
+                "description": "Description of the image in markdown format. Start with a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in tabular markdown format, with valid syntax and accurate numbers. If the image is a chart, describe any axis or legends.",
+            },
+        },
+    },
+}
+
+
+class ContentUnderstandingManager:
+
+    def __init__(self, endpoint: str, credential: Union[AsyncTokenCredential, str]):
+        self.endpoint = endpoint
+        self.credential = credential
+
+    async def create_analyzer(self):
+
+        token_provider = get_bearer_token_provider(self.credential, "https://cognitiveservices.azure.com/.default")
+        token = await token_provider()
+        headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+        params = {"api-version": CU_API_VERSION}
+        analyzer_id = image_schema["analyzerId"]
+        cu_endpoint = f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_id}"
+        async with aiohttp.ClientSession() as session:
+            async with session.put(url=cu_endpoint, params=params, headers=headers, json=image_schema) as response:
+                if response.status == 409:
+                    print(f"Analyzer '{analyzer_id}' already exists.")
+                    return
+                elif response.status != 201:
+                    data = await response.text()
+                    # TODO: log it
+                    print(data)
+                    response.raise_for_status()
+                else:
+                    poll_url = response.headers.get("Operation-Location")
+
+            @retry(stop=stop_after_attempt(60), wait=wait_fixed(2))
+            async def poll():
+                async with session.get(poll_url, headers=headers) as response:
+                    response.raise_for_status()
+                    response_json = await response.json()
+                    if response_json["status"] != "Succeeded":
+                        raise ValueError("Retry")
+                    print(response_json)
+
+            await poll()
+
+    def run_cu_image(self, analyzer_name, image):
+        result = self.run_inference(analyzer_name, image)
+        model_output = result["result"]["contents"][0]["fields"]
+        model_output_raw = str(model_output)
+        return model_output, model_output_raw
+
+    async def run_cu_image(self, image_bytes):
+        async with aiohttp.ClientSession() as session:
+            token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
+            headers = {"Authorization": "Bearer " + token.token}
+            params = {"api-version": CU_API_VERSION}
+
+            async with session.post(
+                url=f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_name}:analyze",
+                params=params,
+                headers=headers,
+                data=image_bytes,
+            ) as response:
+                result = await response.json()
+                print(result)
+                poll_url = response.headers["Operation-Location"]
+
+                @retry(stop=stop_after_attempt(60), wait=wait_fixed(2), retry=retry_if_exception_type(ValueError))
+                async def poll():
+                    async with session.get(poll_url, headers=headers) as response:
+                        response.raise_for_status()
+                        response_json = await response.json()
+                        print(response_json)
+                        if response_json["status"] == "Failed":
+                            raise Exception("Failed")
+                        if response_json["status"] == "Running":
+                            raise ValueError("Running")
+                        return response_json
+
+                response = await poll()
+                return response["result"]["contents"][0]["fields"]
@@ -7,6 +7,7 @@
 from .listfilestrategy import File, ListFileStrategy
 from .searchmanager import SearchManager, Section
 from .strategy import DocumentAction, SearchInfo, Strategy
+from .cu_image import ContentUnderstandingManager
 
 logger = logging.getLogger("scripts")
 
@@ -50,6 +51,8 @@ def __init__(
         search_analyzer_name: Optional[str] = None,
         use_acls: bool = False,
         category: Optional[str] = None,
+        use_content_understanding: bool = False,
+        content_understanding_endpoint: Optional[str] = None,
     ):
         self.list_file_strategy = list_file_strategy
         self.blob_manager = blob_manager
@@ -61,6 +64,8 @@ def __init__(
         self.search_info = search_info
         self.use_acls = use_acls
         self.category = category
+        self.use_content_understanding = use_content_understanding
+        self.content_understanding_endpoint = content_understanding_endpoint
 
     async def setup(self):
         search_manager = SearchManager(
@@ -73,6 +78,10 @@ async def setup(self):
         )
         await search_manager.create_index()
 
+        if self.use_content_understanding:
+            cu_manager = ContentUnderstandingManager(self.content_understanding_endpoint, self.search_info.credential)
+            await cu_manager.create_analyzer()
+
     async def run(self):
         search_manager = SearchManager(
             self.search_info, self.search_analyzer_name, self.use_acls, False, self.embeddings
 
@@ -6,6 +6,7 @@
 
 import pymupdf
 from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
 from azure.ai.documentintelligence.models import DocumentTable
 from azure.core.credentials import AzureKeyCredential
 from azure.core.credentials_async import AsyncTokenCredential
@@ -14,6 +15,7 @@
 
 from .page import Page
 from .parser import Parser
+from .cu_image import ContentUnderstandingManager
 
 logger = logging.getLogger("scripts")
 
@@ -48,24 +50,28 @@ def __init__(
         credential: Union[AsyncTokenCredential, AzureKeyCredential],
         model_id="prebuilt-layout",
         use_content_understanding=True,
+        content_understanding_endpoint: str = None,
     ):
         self.model_id = model_id
         self.endpoint = endpoint
         self.credential = credential
         self.use_content_understanding = use_content_understanding
+        self.content_understanding_endpoint = content_understanding_endpoint
 
     async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
         logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)
 
-        # TODO: do we also need output=figures on the client itself? seems odd.
+        cu_manager = ContentUnderstandingManager(self.content_understanding_endpoint, self.credential)
         async with DocumentIntelligenceClient(
-            endpoint=self.endpoint, credential=self.credential, output="figures"
+            endpoint=self.endpoint, credential=self.credential
         ) as document_intelligence_client:
+            # turn content into bytes
+            content_bytes = content.read()
             if self.use_content_understanding:
                 poller = await document_intelligence_client.begin_analyze_document(
                     model_id="prebuilt-layout",
-                    analyze_request=content,
-                    content_type="application/octet-stream",
+                    analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
+                    # content_type="application/octet-stream",
                     output=["figures"],
                     features=["ocrHighResolution"],
                     output_content_format="markdown",
@@ -109,7 +115,9 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
                 yield Page(page_num=page_num, offset=offset, text=page_text)
                 offset += len(page_text)
 
+            figure_results = {}
             if form_recognizer_results.figures:
+                doc = pymupdf.open(stream=io.BytesIO(content_bytes))
                 for figures_idx, figure in enumerate(form_recognizer_results.figures):
                     for region in figure.bounding_regions:
                         print(f"\tFigure body bounding regions: {region}")
@@ -121,28 +129,44 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
                             region.polygon[5],  # y1 (bottom)
                         )
                     page_number = figure.bounding_regions[0]["pageNumber"]
-                    cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(
-                        content, page_number - 1, bounding_box
-                    )
-
-                    os.makedirs("figures", exist_ok=True)
-
-                    filename = "figure_imagecrop" + str(figures_idx) + ".png"
-                    # Full path for the file
-                    filepath = os.path.join("figures", filename)
+                    cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
 
                     # Save the figure
-                    cropped_img.save(filepath)
                     bytes_io = io.BytesIO()
                     cropped_img.save(bytes_io, format="PNG")
-                    cropped_img = bytes_io.getvalue()
-                    # _ , figure_description = run_cu_image(analyzer_name, filepath)
-
-                    # md_content = replace_figure_description(md_content, figure_description, figures_idx+1)
-                    # figure_content.append(figure_description)
-
-    @classmethod
-    def table_to_html(cls, table: DocumentTable):
+                    image_fields = await cu_manager.run_cu_image(bytes_io.getvalue())
+                    figure_results[figure.id] = image_fields
+
+        md_content = analyze_result.content
+        page_to_figure = {}
+        for figure in analyze_result.figures:
+            # Parse figure id
+            # https://learn.microsoft.com/azure/ai-services/document-intelligence/concept/analyze-document-response?view=doc-intel-4.0.0#figures
+            figure_id = figure.id.split(".")  # 3.1 where 3 is the page number and 1 is the figure number, 1-indexed
+            page = int(figure_id[0])
+            if page not in page_to_figure:
+                page_to_figure[page] = []
+            page_to_figure[page].append(figure.id)
+        for page in form_recognizer_results.pages:
+            # Use the text span to extract the markdown on the page
+            span = page.spans[0]
+            page_md_content = md_content[span.offset : span.offset + span.length]
+            if page.page_number in page_to_figure:
+                page_figures = page_to_figure[page.page_number]
+                # split the content on the figure tag
+                parts = page_md_content.split("\n<figure>\n")
+                for i, figure_id in enumerate(page_figures):
+                    with open(
+                        os.path.join(figures_directory, f"figure_imagecrop_{figure_id}_verbalized.json"), "r"
+                    ) as f:
+                        figure_content = json.dumps(json.load(f)["result"]["contents"][0])
+                        parts[i] = parts[i] + f'<!-- FigureContent="{figure_content}" -->'
+                    page_md_content = "\n".join(parts)
+            with open(os.path.join(pages_md_directory, f"page_{page.page_number}.md"), "w", encoding="utf-8") as f:
+                f.write(page_md_content)
+
+    @staticmethod
+    def table_to_html(table: DocumentTable):
         table_html = "<table>"
         rows = [
             sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index)
@@ -162,8 +186,8 @@ def table_to_html(cls, table: DocumentTable):
         table_html += "</table>"
         return table_html
 
-    @classmethod
-    def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
+    @staticmethod
+    def crop_image_from_pdf_page(doc: pymupdf.Document, page_number, bounding_box):
         """
         Crops a region from a given page in a PDF and returns it as an image.
 
@@ -172,16 +196,13 @@ def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
         :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
         :return: A PIL Image of the cropped area.
         """
-        doc = pymupdf.open(pdf_path)
+        logger.info(f"Cropping image from PDF page {page_number} with bounding box {bounding_box}")
         page = doc.load_page(page_number)
 
         # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1).
         bbx = [x * 72 for x in bounding_box]
         rect = pymupdf.Rect(bbx)
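+        # The bounding box is scaled by 72 above to convert inches to PDF points (72 points per inch) - TODO confirm the polygon unit is inches; the matrix below renders the clip at 300 DPI (300 / 72 zoom).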
         pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect)
 
-        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-
-        doc.close()
-
-        return img
+        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
@@ -5,7 +5,7 @@ openai>=1.3.7
 numpy>=1,<2.1.0 # Used by openai embeddings.create to optimize embeddings (but not required)
 tiktoken
 tenacity
-azure-ai-documentintelligence
+azure-ai-documentintelligence==1.0.0b4
 azure-cognitiveservices-speech
 azure-cosmos
 azure-search-documents==11.6.0b6