Make mypy happy: tighten type annotations and rename ContentUnderstandingManager to ContentUnderstandingDescriber

pamelafox · pamelafox · commit 0681755b1544 · 2024-12-04T15:26:36.000-08:00
diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py
@@ -2,10 +2,10 @@
 from typing import List, Optional
 
 from .blobmanager import BlobManager
-from .cu_image import ContentUnderstandingManager
 from .embeddings import ImageEmbeddings, OpenAIEmbeddings
 from .fileprocessor import FileProcessor
 from .listfilestrategy import File, ListFileStrategy
+from .mediadescriber import ContentUnderstandingDescriber
 from .searchmanager import SearchManager, Section
 from .strategy import DocumentAction, SearchInfo, Strategy
 
@@ -79,7 +79,7 @@ async def setup(self):
         await search_manager.create_index()
 
         if self.use_content_understanding:
-            cu_manager = ContentUnderstandingManager(self.content_understanding_endpoint, self.search_info.credential)
+            cu_manager = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.search_info.credential)
             await cu_manager.create_analyzer()
 
     async def run(self):
diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Union
+from abc import ABC
 
 import aiohttp
 from azure.core.credentials_async import AsyncTokenCredential
@@ -9,39 +9,36 @@
 
 logger = logging.getLogger("scripts")
 
-CU_API_VERSION = "2024-12-01-preview"
-
-PATH_ANALYZER_MANAGEMENT = "/analyzers/{analyzerId}"
-PATH_ANALYZER_MANAGEMENT_OPERATION = "/analyzers/{analyzerId}/operations/{operationId}"
-
-# Define Analyzer inference paths
-PATH_ANALYZER_INFERENCE = "/analyzers/{analyzerId}:analyze"
-PATH_ANALYZER_INFERENCE_GET_IMAGE = "/analyzers/{analyzerId}/results/{operationId}/images/{imageId}"
-
-analyzer_name = "image_analyzer"
-image_schema = {
-    "analyzerId": analyzer_name,
-    "name": "Image understanding",
-    "description": "Extract detailed structured information from images extracted from documents.",
-    "baseAnalyzerId": "prebuilt-image",
-    "scenario": "image",
-    "config": {"returnDetails": False},
-    "fieldSchema": {
-        "name": "ImageInformation",
-        "descriptions": "Description of image.",
-        "fields": {
-            "Description": {
-                "type": "string",
-                "description": "Description of the image. If the image has a title, start with the title. Include a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in an HTML table tag, with accurate numbers. If the image is a chart, describe any axis or legends. The only allowed HTML tags are the table/thead/tr/td/tbody tags.",
-            },
-        },
-    },
-}
 
+class MediaDescriber(ABC):
 
-class ContentUnderstandingManager:
+    async def describe_image(self, image_bytes) -> str:
+        raise NotImplementedError
+
+
+class ContentUnderstandingDescriber(MediaDescriber):
+    CU_API_VERSION = "2024-12-01-preview"
+
+    analyzer_schema = {
+        "analyzerId": "image_analyzer",
+        "name": "Image understanding",
+        "description": "Extract detailed structured information from images extracted from documents.",
+        "baseAnalyzerId": "prebuilt-image",
+        "scenario": "image",
+        "config": {"returnDetails": False},
+        "fieldSchema": {
+            "name": "ImageInformation",
+            "descriptions": "Description of image.",
+            "fields": {
+                "Description": {
+                    "type": "string",
+                    "description": "Description of the image. If the image has a title, start with the title. Include a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in an HTML table tag, with accurate numbers. If the image is a chart, describe any axis or legends. The only allowed HTML tags are the table/thead/tr/td/tbody tags.",
+                },
+            },
+        },
+    }
 
-    def __init__(self, endpoint: str, credential: Union[AsyncTokenCredential, str]):
+    def __init__(self, endpoint: str, credential: AsyncTokenCredential):
         self.endpoint = endpoint
         self.credential = credential
 
@@ -61,16 +58,18 @@ async def poll():
         return await poll()
 
     async def create_analyzer(self):
-        logger.info("Creating analyzer '%s'...", image_schema["analyzerId"])
+        logger.info("Creating analyzer '%s'...", self.analyzer_schema["analyzerId"])
 
         token_provider = get_bearer_token_provider(self.credential, "https://cognitiveservices.azure.com/.default")
         token = await token_provider()
         headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
-        params = {"api-version": CU_API_VERSION}
-        analyzer_id = image_schema["analyzerId"]
+        params = {"api-version": self.CU_API_VERSION}
+        analyzer_id = self.analyzer_schema["analyzerId"]
         cu_endpoint = f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_id}"
         async with aiohttp.ClientSession() as session:
-            async with session.put(url=cu_endpoint, params=params, headers=headers, json=image_schema) as response:
+            async with session.put(
+                url=cu_endpoint, params=params, headers=headers, json=self.analyzer_schema
+            ) as response:
                 if response.status == 409:
                     logger.info("Analyzer '%s' already exists.", analyzer_id)
                     return
@@ -90,8 +89,8 @@ async def describe_image(self, image_bytes) -> str:
         async with aiohttp.ClientSession() as session:
             token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
             headers = {"Authorization": "Bearer " + token.token}
-            params = {"api-version": CU_API_VERSION}
-
+            params = {"api-version": self.CU_API_VERSION}
+            analyzer_name = self.analyzer_schema["analyzerId"]
             async with session.post(
                 url=f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_name}:analyze",
                 params=params,
diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py
@@ -18,7 +18,7 @@
 from PIL import Image
 from pypdf import PdfReader
 
-from .cu_image import ContentUnderstandingManager
+from .mediadescriber import ContentUnderstandingDescriber
 from .page import Page
 from .parser import Parser
 
@@ -55,7 +55,7 @@ def __init__(
         credential: Union[AsyncTokenCredential, AzureKeyCredential],
         model_id="prebuilt-layout",
         use_content_understanding=True,
-        content_understanding_endpoint: str = None,
+        content_understanding_endpoint: Union[str, None] = None,
     ):
         self.model_id = model_id
         self.endpoint = endpoint
@@ -66,13 +66,19 @@ def __init__(
     async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
         logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)
 
-        cu_manager = ContentUnderstandingManager(self.content_understanding_endpoint, self.credential)
         async with DocumentIntelligenceClient(
             endpoint=self.endpoint, credential=self.credential
         ) as document_intelligence_client:
             # turn content into bytes
             content_bytes = content.read()
             if self.use_content_understanding:
+                if self.content_understanding_endpoint is None:
+                    raise ValueError("content_understanding_endpoint should not be None")
+                if isinstance(self.credential, AzureKeyCredential):
+                    raise ValueError(
+                        "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead"
+                    )
+                cu_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)
                 poller = await document_intelligence_client.begin_analyze_document(
                     model_id="prebuilt-layout",
                     analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
@@ -111,7 +117,7 @@ class ObjectType(Enum):
                 # mark all positions of the table spans in the page
                 page_offset = page.spans[0].offset
                 page_length = page.spans[0].length
-                mask_chars = [(ObjectType.NONE, None)] * page_length
+                mask_chars: list[tuple[ObjectType, Union[int, None]]] = [(ObjectType.NONE, None)] * page_length
                 for table_idx, table in enumerate(tables_on_page):
                     for span in table.spans:
                         # replace all table spans with "table_id" in table_chars array
@@ -132,16 +138,23 @@ class ObjectType(Enum):
                 added_objects = set()  # set of object types todo mypy
                 for idx, mask_char in enumerate(mask_chars):
                     object_type, object_idx = mask_char
+                    # Plain-text entries have object_idx None; only validate it for tables and figures.
                     if object_type == ObjectType.NONE:
                         page_text += form_recognizer_results.content[page_offset + idx]
                     elif object_type == ObjectType.TABLE:
+                        if object_idx is None:
+                            raise ValueError("object_idx should not be None")
                         if mask_char not in added_objects:
                             page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx])
                             added_objects.add(mask_char)
                     elif object_type == ObjectType.FIGURE:
+                        if object_idx is None:
+                            raise ValueError("object_idx should not be None")
+                        if cu_describer is None:
+                            raise ValueError("cu_describer should not be None, unable to describe figure")
                         if mask_char not in added_objects:
                             figure_html = await DocumentAnalysisParser.figure_to_html(
-                                doc_for_pymupdf, cu_manager, figures_on_page[object_idx]
+                                doc_for_pymupdf, cu_describer, figures_on_page[object_idx]
                             )
                             page_text += figure_html
                             added_objects.add(mask_char)
@@ -163,9 +173,12 @@ class ObjectType(Enum):
 
     @staticmethod
     async def figure_to_html(
-        doc: pymupdf.Document, cu_manager: ContentUnderstandingManager, figure: DocumentFigure
+        doc: pymupdf.Document, cu_describer: ContentUnderstandingDescriber, figure: DocumentFigure
     ) -> str:
-        logger.info("Describing figure '%s'", figure.id)
+        figure_title = (figure.caption and figure.caption.content) or ""
+        logger.info("Describing figure '%s' with title '%s'", figure.id, figure_title)
+        if not figure.bounding_regions:
+            return f"<figure><figcaption>{figure_title}</figcaption></figure>"
         for region in figure.bounding_regions:
             # To learn more about bounding regions, see https://aka.ms/bounding-region
             bounding_box = (
@@ -176,8 +189,7 @@ async def figure_to_html(
             )
         page_number = figure.bounding_regions[0]["pageNumber"]  # 1-indexed
         cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
-        figure_description = await cu_manager.describe_image(cropped_img)
-        figure_title = (figure.caption and figure.caption.content) or ""
+        figure_description = await cu_describer.describe_image(cropped_img)
         return f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
 
     @staticmethod
@@ -221,7 +233,7 @@ def crop_image_from_pdf_page(doc: pymupdf.Document, page_number, bounding_box) -
         # The matrix is used to convert between these 2 units
         pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect)
 
-        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
         bytes_io = io.BytesIO()
         img.save(bytes_io, format="PNG")
         return bytes_io.getvalue()