Commit 0b9370f

Update image processing work

1 parent 8e6b61d commit 0b9370f

File tree

4 files changed: +145, -47 lines changed


image_processing/.env.example

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 OpenAI__Endpoint=<openAIEndpoint>
-OpenAI__CompletionDeployment=<openAIEmbeddingDeploymentId>
+OpenAI__MiniCompletionDeployment=<openAIEmbeddingDeploymentId>
 OpenAI__ApiVersion=<openAIApiVersion>
 AIService__DocumentIntelligence__Endpoint=<documentIntelligenceEndpoint>
 StorageAccount__Name=<Name of storage account if using identity based connections>

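Since the deployment setting was renamed, code reading the old `OpenAI__CompletionDeployment` variable will no longer find it. A minimal sketch of reading the new setting fail-fast with `os.environ[...]` (all values below are placeholders, not real endpoints or deployments):

```python
import os

# Placeholder values standing in for a real .env / app settings file.
os.environ["OpenAI__Endpoint"] = "https://example.openai.azure.com/"
os.environ["OpenAI__MiniCompletionDeployment"] = "my-gpt-4o-mini-deployment"

# Indexing with os.environ[...] raises KeyError if the setting is missing,
# while os.environ.get(...) would silently return None and fail much later.
deployment_id = os.environ["OpenAI__MiniCompletionDeployment"]
print(deployment_id)

# The old setting name is no longer part of the template after this commit:
print(os.environ.get("OpenAI__CompletionDeployment"))  # None unless still set
```

Failing fast on a missing setting surfaces configuration drift at startup rather than mid-request.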
image_processing/README.md

Lines changed: 3 additions & 3 deletions

@@ -1,6 +1,6 @@
 # AI Search Indexing with Azure Document Intelligence
 
-This portion of the repo contains code for linking Azure Document Intelligence with AI Search to process complex documents with charts and images, and uses multi-modal models (gpt4o) to interpret and understand these.
+This portion of the repo contains code for linking Azure Document Intelligence with AI Search to process complex documents with charts and images, and uses multi-modal models (gpt-4o-mini) to interpret and understand these.
 
 The implementation is in Python, although it can easily be adapted for C# or another language. The code is designed to run in an Azure Function App inside the tenant.
 
@@ -22,7 +22,7 @@ Instead of using OCR to extract the contents of the document, ADIv4 is used to a
 
 Once the Markdown is obtained, several steps are carried out:
 
-1. **Extraction of images / charts**. The figures identified are extracted from the original document and passed to a multi-modal model (gpt4o in this case) for analysis. We obtain a description and summary of the chart / image to infer the meaning of the figure. This allows us to index and perform RAG analysis on the information that is visually obtainable from a chart, without it being explicitly mentioned in the surrounding text. The information is added back into the original chart.
+1. **Extraction of images / charts**. The figures identified are extracted from the original document and passed to a multi-modal model (gpt-4o-mini in this case) for analysis. We obtain a description and summary of the chart / image to infer the meaning of the figure. This allows us to index and perform RAG analysis on the information that is visually obtainable from a chart, without it being explicitly mentioned in the surrounding text. The information is added back into the original chart.
 
 2. **Chunking**. The obtained content is chunked accordingly depending on the chunking strategy. This function app supports two chunking methods, **page wise** and **semantic chunking**. The page wise chunking is performed natively by Azure Document Intelligence. For semantic chunking, we include a custom chunker that splits the text with the following strategy:
 
@@ -82,7 +82,7 @@ You can then test the chunking by sending an AI Search JSON format to the `/seman
 ### Deployment Steps
 
 1. Update `.env` file with the associated values. Not all values are required, depending on whether you are using System / User Assigned Identities or key-based authentication. Use this template to update the environment variables in the function app.
-2. Make sure the infra and required identities are setup. This setup requires Azure Document Intelligence and GPT4o.
+2. Make sure the infra and required identities are set up. This setup requires Azure Document Intelligence and gpt-4o-mini.
 3. [Deploy your function app](https://learn.microsoft.com/en-us/azure/azure-functions/functions-deployment-technologies?tabs=windows) and test with an HTTP request.
 
 ### Code Files

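The README's testing step sends an AI Search-style JSON body to the function app. The outer `values` / `recordId` / `data` envelope below follows the Azure AI Search custom-skill contract; the `content` key inside `data` is an illustrative assumption, since the skillset definition decides the exact input names:

```python
import json

# Hypothetical custom-skill request body. The envelope
# {"values": [{"recordId": ..., "data": {...}}]} is the AI Search custom
# skill shape; "content" is an assumed input field name.
request_body = {
    "values": [
        {
            "recordId": "0",
            # Markdown as produced by Azure Document Intelligence:
            "data": {"content": "# Report\n\nSome extracted text..."},
        }
    ]
}

payload = json.dumps(request_body)

# A skill response echoes each recordId back with data / errors / warnings.
parsed = json.loads(payload)
print(parsed["values"][0]["recordId"])
```

Each record is processed independently, which is why the error responses later in this commit are keyed by `recordId`.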
image_processing/src/image_processing/figure_analysis.py

Lines changed: 116 additions & 28 deletions

@@ -10,12 +10,35 @@
     APIError,
     APIStatusError,
     BadRequestError,
+    RateLimitError,
 )
-from tenacity import retry, stop_after_attempt, wait_exponential
+from tenacity import retry, stop_after_attempt, wait_exponential, RetryError
 from layout_holders import FigureHolder
+from PIL import Image
+import io
+import base64
 
 
 class FigureAnalysis:
+    def get_image_size(self, figure: FigureHolder) -> tuple[int, int]:
+        """Get the size of the image from the binary data.
+
+        Parameters:
+        - figure (FigureHolder): The figure object containing the image data.
+
+        Returns:
+        - width (int): The width of the image.
+        - height (int): The height of the image."""
+        # Create a BytesIO object from the binary data
+        image_data = base64.b64decode(figure.data)
+        image_stream = io.BytesIO(image_data)
+
+        # Open the image using PIL
+        with Image.open(image_stream) as img:
+            # Get the size of the image
+            width, height = img.size
+            return width, height
+
     @retry(
         stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10)
     )
@@ -31,45 +54,86 @@ async def understand_image_with_gptv(self, figure: FigureHolder) -> dict:
         - img_description (str): The generated description for the image.
         """
 
+        # Open figure and check if below minimum size
+        width, height = self.get_image_size(figure)
+
+        if width < 75 and height < 75:
+            logging.info(
+                "Image is too small to be analysed. Width: %i, Height: %i",
+                width,
+                height,
+            )
+            figure.description = "Irrelevant Image"
+
+            return figure
+
         MAX_TOKENS = 2000
         api_version = os.environ["OpenAI__ApiVersion"]
-        model = os.environ["OpenAI__CompletionDeployment"]
+        model_name = "gpt-4o-mini"
+        deployment_id = os.environ["OpenAI__MiniCompletionDeployment"]
+        azure_endpoint = os.environ["OpenAI__Endpoint"]
 
         token_provider = get_bearer_token_provider(
             DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
         )
 
-        system_prompt = """You are an expert in technical image analysis. Your task is to provided analysis of images. You should FOCUS on what info can be inferred from the image and the meaning of the data inside the image. Draw actionable insights and conclusions from the image. Do not describe the image in a general way or describe the image in a way that is not useful for decision-making.
+        system_prompt = """You are an expert in technical image description and analysis for search and retrieval. Your task is to describe the key details, themes, and practical applications of the image, focusing on how the image could be used and what it helps the user achieve. Additionally, provide a brief explanation of what can be inferred from the image, such as trends, relationships, or insights.
+
+        It is essential to include all visible labels, data points, and annotations in your description. Use natural terms and phrases that users might search for to locate the image.
+
+        Charts and Graphs:
+        - Identify the type of chart and describe the data points, trends, and labels present.
+        - Explain how the chart can be used (e.g., for analyzing trends, tracking performance, or comparing metrics).
+        - Describe what can be inferred, such as patterns over time, correlations, or key insights from the data.
+
+        Maps:
+        - Highlight geographical features, landmarks, and any text labels or annotations, such as street names or distances.
+        - Explain how the map can be used (e.g., for navigation, travel planning, or understanding a region).
+        - Describe what can be inferred, such as proximity between locations, accessibility of areas, or regional layouts.
 
-        If the image is a chart for instance, you should describe the data trends, patterns, and insights that can be drawn from the chart. For example, you could describe the increase or decrease in sales over time, the peak sales period, or the sales performance of a particular product.
+        Diagrams:
+        - Describe the components, relationships, and purpose of the diagram.
+        - Explain how the diagram can be used (e.g., for understanding a process, visualizing a system, or explaining a concept).
+        - Describe what can be inferred, such as how components interact, dependencies, or the overall system structure.
 
-        If the image is a map, you should describe the geographical features, landmarks, and any other relevant information that can be inferred from the map.
+        Photographs or Logos:
+        - Return 'Irrelevant Image' if the image is not suitable for actionable purposes like analysis or decision-making e.g. a logo, a personal photo, or a generic landscape.
 
-        If the image is a diagram, you should describe the components, relationships, and any other relevant information that can be inferred from the diagram.
 
-        Include any data points, labels, and other relevant information that can be inferred from the image.
+        Guidelines:
+        - Include all labels, text, and annotations to ensure a complete and accurate description.
+        - Clearly state both the potential use of the image and what insights or information can be inferred from it.
+        - Think about what the user might need from the image and describe it accordingly.
+        - Make sure to consider if the image will be useful for analysis later on. If nothing valuable for analysis, decision making or information retrieval, would be able to be inferred from the image, return 'Irrelevant Image'.
 
-        Provide a well-structured, detailed, and actionable analysis of the image. Focus on extracting data and information that can be inferred from the image.
+        Example:
+        Input:
+        - A bar chart showing monthly sales for 2024, with the x-axis labeled "Month" (January to December) and the y-axis labeled "Revenue in USD." The chart shows a steady increase from January to December, with a sharp spike in November.
+        Output:
+        - This bar chart shows monthly sales revenue for 2024, with the x-axis labeled 'Month' (January to December) and the y-axis labeled 'Revenue in USD.' It can be used to track sales performance over the year and identify periods of high or low revenue. From the chart, it can be inferred that sales steadily increased throughout the year, with a notable spike in November, possibly due to seasonal promotions or events.
 
-        IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'."""
+        Input:
+        - A photograph of a mountain landscape with snow-capped peaks, a winding river, and a dense forest in the foreground. The image captures the natural beauty of the region and the diverse ecosystems present.
+        Output:
+        - Irrelevant Image"""
 
-        user_input = "Perform technical analysis on this image. Provide a well-structured, description."
+        user_input = "Generate a description for the image provided that can be used for search purposes."
 
         if figure.caption is not None and len(figure.caption) > 0:
-            user_input += " (note: it has the following caption: {})".format(
-                figure.caption
-            )
+            user_input += f""" (note: it has the following caption: {
+                figure.caption})"""
 
         try:
            async with AsyncAzureOpenAI(
                api_key=None,
                api_version=api_version,
                azure_ad_token_provider=token_provider,
-                azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
+                azure_endpoint=azure_endpoint,
+                azure_deployment=deployment_id,
            ) as client:
                # We send both image caption and the image body to GPTv for better understanding
                response = await client.chat.completions.create(
-                    model=model,
+                    model=model_name,
                    messages=[
                        {
                            "role": "system",
@@ -93,7 +157,13 @@ async def understand_image_with_gptv(self, figure: FigureHolder) -> dict:
                    ],
                    max_tokens=MAX_TOKENS,
                )
-        except (OpenAIError, APIError, APIStatusError, BadRequestError) as e:
+        except (
+            OpenAIError,
+            APIError,
+            APIStatusError,
+            BadRequestError,
+            RateLimitError,
+        ) as e:
            logging.error(f"Failed to analyse image. Error: {e}")
 
            if "ResponsibleAIPolicyViolation" in e.message:
@@ -108,6 +178,10 @@ async def understand_image_with_gptv(self, figure: FigureHolder) -> dict:
 
        figure.description = response.choices[0].message.content
 
+        if len(figure.description) == 0:
+            logging.info("No description generated for image.")
+            figure.description = "Irrelevant Image"
+
        logging.info(f"Image Description: {figure.description}")
 
        return figure
@@ -128,20 +202,34 @@ async def analyse(self, record: dict) -> dict:
 
        try:
            updated_data = await self.understand_image_with_gptv(figure)
-            logging.info(f"Updated Data: {updated_data}")
-        except Exception as e:
+            logging.info(f"Updated Figure Data: {updated_data}")
+        except RetryError as e:
            logging.error(f"Failed to analyse image. Error: {e}")
            logging.error(f"Failed input: {record}")
-            return {
-                "recordId": record["recordId"],
-                "data": {},
-                "errors": [
-                    {
-                        "message": "Failed to analyse image. Pass a valid source in the request body.",
-                    }
-                ],
-                "warnings": None,
-            }
+            root_cause = e.last_attempt.exception()
+
+            if isinstance(root_cause, RateLimitError):
+                return {
+                    "recordId": record["recordId"],
+                    "data": None,
+                    "errors": [
+                        {
+                            "message": "Failed to analyse image due to rate limit error. Please try again later.",
+                        }
+                    ],
+                    "warnings": None,
+                }
+            else:
+                return {
+                    "recordId": record["recordId"],
+                    "data": None,
+                    "errors": [
+                        {
+                            "message": "Failed to analyse image. Check the logs for more details.",
+                        }
+                    ],
+                    "warnings": None,
+                }
        else:
            return {
                "recordId": record["recordId"],
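The `analyse` method above now catches tenacity's `RetryError` and calls `last_attempt.exception()` to branch on the root cause (rate limiting vs. everything else). A stdlib-only sketch of that pattern, assuming tenacity is unavailable (the tiny `retry` decorator and both exception classes here are stand-ins, not the tenacity API):

```python
# Minimal stand-in for tenacity's RetryError: after the final attempt fails,
# the wrapper raises an exception that carries the root cause so the caller
# can branch on its type.
class RetryError(Exception):
    def __init__(self, last_exception):
        super().__init__(f"retries exhausted: {last_exception!r}")
        self.last_exception = last_exception


class RateLimitError(Exception):
    pass


def retry(attempts=3):
    def decorator(fn):
        def wrapper(*args, **kwargs):
            last = None
            for _ in range(attempts):
                try:
                    return fn(*args, **kwargs)
                except Exception as e:  # real code would catch narrower types
                    last = e
            raise RetryError(last)
        return wrapper
    return decorator


@retry(attempts=3)
def always_throttled():
    raise RateLimitError("429 Too Many Requests")


try:
    always_throttled()
except RetryError as e:
    root_cause = e.last_exception
    message = (
        "rate limited, try later"
        if isinstance(root_cause, RateLimitError)
        else "failed, check logs"
    )
    print(message)
```

Branching on the root cause lets the skill return an actionable "try again later" message for throttling while keeping a generic error for everything else.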
image_processing/src/image_processing/layout_analysis.py

Lines changed: 25 additions & 15 deletions

@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
-# This code originates from: https://github.com/microsoft/dstoolkit-text2sql-and-imageprocessing
 
 import logging
 import os
@@ -32,7 +31,7 @@ class StorageAccountHelper:
     @property
     def account_url(self) -> str:
         """Get the account URL of the Azure Blob Storage."""
-        storage_account_name = os.environ.get("StorageAccount__Name")
+        storage_account_name = os.environ["StorageAccount__Name"]
         return f"https://{storage_account_name}.blob.core.windows.net"
 
     async def get_client(self):
@@ -42,7 +41,7 @@ async def get_client(self):
         return BlobServiceClient(account_url=self.account_url, credential=credential)
 
     async def add_metadata_to_blob(
-        self, source: str, container: str, metadata: dict
+        self, source: str, container: str, metadata: dict, upsert: bool = False
     ) -> None:
         """Add metadata to the blob.
 
@@ -51,14 +50,24 @@ async def add_metadata_to_blob(
            container (str): The container of the blob.
            metadata (dict): The metadata to add to the blob."""
 
-        blob = urllib.parse.unquote_plus(source)
+        logging.info("Adding Metadata")
+
+        blob = urllib.parse.unquote(source, encoding="utf-8")
 
        blob_service_client = await self.get_client()
        async with blob_service_client:
            async with blob_service_client.get_blob_client(
                container=container, blob=blob
            ) as blob_client:
-                await blob_client.set_blob_metadata(metadata)
+                blob_properties = await blob_client.get_blob_properties()
+
+                if upsert:
+                    updated_metadata = blob_properties.metadata
+                    updated_metadata.update(metadata)
+                else:
+                    updated_metadata = metadata
+
+                await blob_client.set_blob_metadata(updated_metadata)
 
        logging.info("Metadata Added")
 
@@ -103,7 +112,7 @@ async def download_blob_to_temp_dir(
            container (str): The container of the blob.
            target_file_name (str): The target file name."""
 
-        blob = urllib.parse.unquote_plus(source)
+        blob = urllib.parse.unquote(source)
 
        blob_service_client = await self.get_client()
        async with blob_service_client:
@@ -254,11 +263,9 @@ async def process_figures_from_extracted_content(
            )
 
            logging.info(f"Figure Caption: {caption}")
-            uri = "{}/{}/{}".format(
-                storage_account_helper.account_url,
-                self.images_container,
-                blob,
-            )
+
+            uri = f"""{
+                storage_account_helper.account_url}/{self.images_container}/{blob}"""
 
            offset = figure.spans[0].offset - text_holder.page_offsets
 
@@ -414,7 +421,7 @@ async def analyse(self):
            logging.error(f"Failed to download the blob: {e}")
            return {
                "recordId": self.record_id,
-                "data": {},
+                "data": None,
                "errors": [
                    {
                        "message": f"Failed to download the blob. Check the source and try again. {e}",
@@ -430,9 +437,12 @@ async def analyse(self):
            logging.error(
                "Failed to analyse %s with Azure Document Intelligence.", self.blob
            )
+            await storage_account_helper.add_metadata_to_blob(
+                self.blob, self.container, {"AzureSearch_Skip": "true"}, upsert=True
+            )
            return {
                "recordId": self.record_id,
-                "data": {},
+                "data": None,
                "errors": [
                    {
                        "message": f"Failed to analyze the document with Azure Document Intelligence. Check the logs and try again. {e}",
@@ -484,7 +494,7 @@ async def analyse(self):
            logging.error(f"Failed to process the extracted content: {e}")
            return {
                "recordId": self.record_id,
-                "data": {},
+                "data": None,
                "errors": [
                    {
                        "message": f"Failed to process the extracted content. Check the logs and try again. {e}",
@@ -536,7 +546,7 @@ async def process_layout_analysis(
    except KeyError:
        return {
            "recordId": record["recordId"],
-            "data": {},
+            "data": None,
            "errors": [
                {
                    "message": "Failed to extract data with ADI. Pass a valid source in the request body.",
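One subtle change in this file is the switch from `urllib.parse.unquote_plus` to `urllib.parse.unquote` when decoding blob names: `unquote_plus` additionally turns `+` into a space (form-encoding semantics), which would resolve the wrong blob for names containing a literal `+`. A quick illustration with a hypothetical blob name:

```python
from urllib.parse import unquote, unquote_plus

# A blob whose real name contains a literal '+' character (hypothetical).
encoded = "reports/q1+q2%20summary.pdf"

# unquote_plus also decodes '+' as a space, corrupting the name:
print(unquote_plus(encoded))  # reports/q1 q2 summary.pdf

# unquote only decodes percent-escapes, preserving the '+':
print(unquote(encoded, encoding="utf-8"))  # reports/q1+q2 summary.pdf
```

`unquote_plus` is meant for `application/x-www-form-urlencoded` query strings, not URL path segments, so `unquote` is the safer choice here.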

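The new `upsert` flag on `add_metadata_to_blob` merges incoming metadata into the blob's existing metadata instead of replacing it wholesale (`set_blob_metadata` overwrites all metadata on the blob). The merge itself is plain dict semantics, sketched here with a hypothetical helper:

```python
# Sketch of the upsert semantics: with upsert=True the new metadata is merged
# into the existing metadata; with upsert=False only the new keys survive,
# mirroring set_blob_metadata's replace-all behaviour.
def merge_metadata(existing: dict, new: dict, upsert: bool) -> dict:
    if upsert:
        updated = dict(existing)  # copy so the caller's dict is untouched
        updated.update(new)
        return updated
    return new


existing = {"Author": "alice", "AzureSearch_Skip": "false"}
new = {"AzureSearch_Skip": "true"}

print(merge_metadata(existing, new, upsert=True))
print(merge_metadata(existing, new, upsert=False))
```

Passing `upsert=True` when tagging failed documents with `AzureSearch_Skip` keeps any metadata other skills have already written.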