
Commit ea3ee28

More prepdocs improvements for image handling
1 parent 7c8f825 commit ea3ee28

File tree

9 files changed, +133 -97 lines changed


app/backend/prepdocslib/blobmanager.py

Lines changed: 13 additions & 8 deletions
@@ -56,16 +56,21 @@ async def upload_blob(self, file: File) -> Optional[list[str]]:
             if file.url is None:
                 with open(file.content.name, "rb") as reopened_file:
                     blob_name = BlobManager.blob_name_from_file_name(file.content.name)
-                    logger.info("Uploading blob for whole file -> %s", blob_name)
+                    logger.info("Uploading blob for document %s", blob_name)
                     blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
                     file.url = blob_client.url
-
-            #if self.store_page_images:
-            #    if os.path.splitext(file.content.name)[1].lower() == ".pdf":
-            #        return await self.upload_pdf_blob_images(service_client, container_client, file)
-            #    else:
-            #        logger.info("File %s is not a PDF, skipping image upload", file.content.name)
-
+        return None
+
+    async def upload_document_image(self, document_file: File, image_bytes: bytes, image_filename: str) -> Optional[str]:
+        async with BlobServiceClient(
+            account_url=self.endpoint, credential=self.credential, max_single_put_size=4 * 1024 * 1024
+        ) as service_client, service_client.get_container_client(self.container) as container_client:
+            if not await container_client.exists():
+                await container_client.create_container()
+            blob_name = BlobManager.blob_name_from_file_name(document_file.content.name) + "/" + image_filename
+            logger.info("Uploading blob for document image %s", blob_name)
+            blob_client = await container_client.upload_blob(blob_name, io.BytesIO(image_bytes), overwrite=True)
+            return blob_client.url
         return None
 
     def get_managedidentity_connectionstring(self):
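
Note (not part of the commit): a minimal usage sketch for the new upload_document_image method. It assumes blob_manager is an already-configured BlobManager, file is the prepdocs File for the source document, and images is a list of ImageOnPage objects produced by the parser; each image blob is stored under "<document blob name>/<image filename>".

# Hypothetical helper, for illustration only
async def upload_figures(blob_manager, file, images):
    for image in images:
        # upload_document_image returns the blob URL for the stored figure
        image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename)
    return images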

app/backend/prepdocslib/embeddings.py

Lines changed: 10 additions & 14 deletions
@@ -236,28 +236,24 @@ def __init__(self, endpoint: str, token_provider: Callable[[], Awaitable[str]]):
         self.token_provider = token_provider
         self.endpoint = endpoint
 
-    async def create_embeddings(self, blob_urls: list[str]) -> list[list[float]]:
+    async def create_embedding(self, image_bytes: bytes) -> list[float]:
         endpoint = urljoin(self.endpoint, "computervision/retrieval:vectorizeImage")
-        headers = {"Content-Type": "application/json"}
         params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
-        headers["Authorization"] = "Bearer " + await self.token_provider()
+        headers = {"Authorization": "Bearer " + await self.token_provider()}
 
-        embeddings: list[list[float]] = []
         async with aiohttp.ClientSession(headers=headers) as session:
-            for blob_url in blob_urls:
-                async for attempt in AsyncRetrying(
-                    retry=retry_if_exception_type(Exception),
+            async for attempt in AsyncRetrying(
+                retry=retry_if_exception_type(Exception),
                 wait=wait_random_exponential(min=15, max=60),
                 stop=stop_after_attempt(15),
                 before_sleep=self.before_retry_sleep,
             ):
-                with attempt:
-                    body = {"url": blob_url}
-                    async with session.post(url=endpoint, params=params, json=body) as resp:
-                        resp_json = await resp.json()
-                        embeddings.append(resp_json["vector"])
-
-        return embeddings
+                with attempt:
+                    async with session.post(url=endpoint, params=params, data=image_bytes) as resp:
+                        resp_json = await resp.json()
+                        return resp_json["vector"]
+
+        return []
 
     def before_retry_sleep(self, retry_state):
         logger.info("Rate limited on the Vision embeddings API, sleeping before retrying...")

app/backend/prepdocslib/filestrategy.py

Lines changed: 18 additions & 9 deletions
@@ -18,7 +18,8 @@ async def parse_file(
     file: File,
     file_processors: dict[str, FileProcessor],
     category: Optional[str] = None,
-    image_embeddings: Optional[ImageEmbeddings] = None,
+    blob_manager: Optional[BlobManager] = None,
+    image_embeddings_client: Optional[ImageEmbeddings] = None,
 ) -> list[Section]:
     key = file.file_extension().lower()
     processor = file_processors.get(key)
@@ -27,12 +28,24 @@ async def parse_file(
         return []
     logger.info("Ingesting '%s'", file.filename())
     pages = [page async for page in processor.parser.parse(content=file.content)]
+    for page in pages:
+        for image in page.images:
+            if image.url is None:
+                image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename)
+            if image_embeddings_client:
+                image.embedding = await image_embeddings_client.create_embedding(image.bytes)
     logger.info("Splitting '%s' into sections", file.filename())
-    if image_embeddings:
-        logger.warning("Each page will be split into smaller chunks of text, but images will be of the entire page.")
     sections = [
         Section(split_page, content=file, category=category) for split_page in processor.splitter.split_pages(pages)
     ]
+    # For now, add the images back to each split page based off split_page.page_num
+    for section in sections:
+        section.split_page.images = [
+            image for page in pages if page.page_num == section.split_page.page_num for image in page.images
+        ]
+        logger.info(
+            "Section for page %d has %d images", section.split_page.page_num, len(section.split_page.images)
+        )
     return sections
 
 
@@ -102,13 +115,9 @@ async def run(self):
         files = self.list_file_strategy.list()
         async for file in files:
             try:
-                sections = await parse_file(file, self.file_processors, self.category, self.image_embeddings)
+                sections = await parse_file(file, self.file_processors, self.category, self.blob_manager, self.image_embeddings)
                 if sections:
-                    blob_sas_uris = await self.blob_manager.upload_blob(file)
-                    blob_image_embeddings: Optional[list[list[float]]] = None
-                    if self.image_embeddings and blob_sas_uris:
-                        blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
-                    await self.search_manager.update_content(sections, blob_image_embeddings, url=file.url)
+                    await self.search_manager.update_content(sections, url=file.url)
             finally:
                 if file:
                     file.close()
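
Note (not part of the commit): an end-to-end sketch of how the reworked parse_file is intended to be called, per the run() change above. It assumes file_processors, blob_manager, image_embeddings, and search_manager are already configured as elsewhere in prepdocs.

async def ingest_one(file, file_processors, blob_manager, image_embeddings, search_manager, category=None):
    sections = await parse_file(
        file,
        file_processors,
        category=category,
        blob_manager=blob_manager,
        image_embeddings_client=image_embeddings,
    )
    if sections:
        # Image URLs and embeddings are already attached to each section's split page
        await search_manager.update_content(sections, url=file.url)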

app/backend/prepdocslib/goals.json

Lines changed: 9 additions & 2 deletions
@@ -5,5 +5,12 @@
     "oids": [],
     "groups": [],
     "images": # collection of objects with fields https://learn.microsoft.com/en-us/azure/search/vector-search-multi-vector-fields
-        [ {embedding, url, verbalization, boundingbox},
-          {embedding, url, verbalization, boundingbox} ]
+        [ {embedding, url, description, boundingbox},
+          {embedding, url, description, boundingbox} ]
+
+# Consider gpt-4.1-mini as default: pricier? but relatively not pricey compared to o3 and gpt-4o. run our evals. its better at instruction following.
+
+# Parse each page, get back text with descritpions, associate each page with images on that page
+# Each image needs the citation file.pdf#figure=1 via Pillow
+# Each image needs to be stored in Blob storage
+# Update the search index with all the info
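
Note (illustration only, based on the notes above): a rough sketch of what one indexed document with a multi-vector "images" collection might look like. The field names follow the notes (embedding, url, description, boundingbox); this is not a final schema, and all values are placeholders.

example_document = {
    "id": "file-pdf-page-3",
    "content": "Chunked page text ...",
    "oids": [],
    "groups": [],
    "images": [
        {
            "embedding": [0.012, -0.034],  # truncated image embedding vector
            "url": "https://<account>.blob.core.windows.net/content/file.pdf/page_3_figure_3.1.png",
            "description": "<figure><figcaption>Chart title<br>Model-generated description</figcaption></figure>",
            "boundingbox": [1.0, 2.0, 4.5, 6.0],
        }
    ],
}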

app/backend/prepdocslib/mediadescriber.py

Lines changed: 1 addition & 4 deletions
@@ -86,7 +86,6 @@ async def create_analyzer(self):
             await self.poll_api(session, poll_url, headers)
 
     async def describe_image(self, image_bytes: bytes) -> str:
-        logger.info("Sending image to Azure Content Understanding service...")
         async with aiohttp.ClientSession() as session:
             token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
             headers = {"Authorization": "Bearer " + token.token}
@@ -115,7 +114,6 @@ def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: str):
         self.deployment = deployment
 
     async def describe_image(self, image_bytes: bytes) -> str:
-        logger.info("Describing image using LLM...")
         image_base64 = base64.b64encode(image_bytes).decode("utf-8")
         image_datauri = f"data:image/png;base64,{image_base64}"
 
@@ -131,10 +129,9 @@ async def describe_image(self, image_bytes: bytes) -> str:
                 "role": "user",
                 "content":
                 [{"text": "Describe image with no more than 5 sentences. Do not speculate about anything you don't know.", "type": "text"},
-                 {"image_url": {"url": image_datauri}, "type": "image_url", "detail": "low"}]
+                 {"image_url": {"url": image_datauri}, "type": "image_url", "detail": "auto"}]
             }
         ])
         description = response.choices[0].message.content.strip() if response.choices else ""
-        print(description)
         return description
 
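
Note (not part of the commit): a minimal sketch of calling the multimodal describer on one cropped figure. The constructor arguments match the __init__ shown above; the model and deployment values here are placeholders, and AsyncOpenAI() assumes the usual API-key environment configuration.

from openai import AsyncOpenAI
from prepdocslib.mediadescriber import MultimodalModelDescriber

async def describe_figure(png_bytes: bytes) -> str:
    describer = MultimodalModelDescriber(
        openai_client=AsyncOpenAI(),  # assumes OPENAI_API_KEY is set
        model="gpt-4o",               # placeholder model name
        deployment="gpt-4o",          # placeholder deployment name
    )
    # The describer base64-encodes the PNG into a data URI and sends it with detail="auto"
    return await describer.describe_image(png_bytes)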

app/backend/prepdocslib/page.py

Lines changed: 22 additions & 10 deletions
@@ -1,3 +1,17 @@
+from typing import Sequence
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ImageOnPage:
+    bytes: bytes
+    bbox: tuple[float, float, float, float]
+    filename: str
+    description: str
+    url: str | None = None
+    embedding: list[float] | None = None
+
+@dataclass
 class Page:
     """
     A single page from a document
@@ -7,13 +21,12 @@ class Page:
         offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow")
         text (str): The text of the page
     """
+    page_num: int
+    offset: int
+    text: str
+    images: list[ImageOnPage] = field(default_factory=list)
 
-    def __init__(self, page_num: int, offset: int, text: str):
-        self.page_num = page_num
-        self.offset = offset
-        self.text = text
-
-
+@dataclass
 class SplitPage:
     """
     A section of a page that has been split into a smaller chunk.
@@ -22,7 +35,6 @@ class SplitPage:
         page_num (int): Page number (0-indexed)
         text (str): The text of the section
     """
-
-    def __init__(self, page_num: int, text: str):
-        self.page_num = page_num
-        self.text = text
+    page_num: int
+    text: str
+    images: list[ImageOnPage] = field(default_factory=list)
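
Note (illustration only): constructing the new dataclasses; every value below is made up.

from prepdocslib.page import ImageOnPage, Page, SplitPage

figure = ImageOnPage(
    bytes=b"...PNG bytes...",
    bbox=(1.0, 2.0, 4.5, 6.0),  # placeholder bounding box for the figure on the page
    filename="page_1_figure_1.1.png",
    description="<figure><figcaption>Chart<br>Model-generated description</figcaption></figure>",
)
page = Page(page_num=0, offset=0, text="Page text including the figure description", images=[figure])
chunk = SplitPage(page_num=0, text="A smaller chunk of that page", images=page.images)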

app/backend/prepdocslib/pdfparser.py

Lines changed: 18 additions & 11 deletions
@@ -21,7 +21,7 @@
 from openai import AsyncOpenAI
 
 from .mediadescriber import MediaDescriber, ContentUnderstandingDescriber, MultimodalModelDescriber
-from .page import Page
+from .page import Page, ImageOnPage
 from .parser import Parser
 
 logger = logging.getLogger("scripts")
@@ -50,6 +50,8 @@ class MediaDescriptionStrategy(Enum):
     OPENAI = "openai"
     CONTENTUNDERSTANDING = "content_understanding"
 
+
+
 class DocumentAnalysisParser(Parser):
     """
     Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages
@@ -68,6 +70,7 @@ def __init__(
         openai_deployment: Optional[str] = None,
         # If using Content Understanding, this is the endpoint for the service
         content_understanding_endpoint: Union[str, None] = None,
+        # should this take the blob storage info too?
     ):
         self.model_id = model_id
         self.endpoint = endpoint
@@ -137,6 +140,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
             analyze_result: AnalyzeResult = await poller.result()
 
             offset = 0
+
             for page in analyze_result.pages:
                 tables_on_page = [
                     table
@@ -150,6 +154,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
                     for figure in (analyze_result.figures or [])
                     if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number
                 ]
+                page_images: list[ImageOnPage] = []
 
                 class ObjectType(Enum):
                     NONE = -1
@@ -195,24 +200,25 @@ class ObjectType(Enum):
                         if object_idx is None:
                             raise ValueError("Expected object_idx to be set")
                         if mask_char not in added_objects:
-                            figure_html = await DocumentAnalysisParser.figure_to_html(
+                            image_on_page = await DocumentAnalysisParser.process_figure(
                                 doc_for_pymupdf, figures_on_page[object_idx], media_describer
                             )
-                            page_text += figure_html
+                            page_images.append(image_on_page)
+                            page_text += image_on_page.description
                             added_objects.add(mask_char)
                 # We remove these comments since they are not needed and skew the page numbers
                 page_text = page_text.replace("<!-- PageBreak -->", "")
                 # We remove excess newlines at the beginning and end of the page
                 page_text = page_text.strip()
-                yield Page(page_num=page.page_number - 1, offset=offset, text=page_text)
+                yield Page(page_num=page.page_number - 1, offset=offset, text=page_text, images=page_images)
                 offset += len(page_text)
 
     @staticmethod
-    async def figure_to_html(
+    async def process_figure(
         doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber
     ) -> str:
         figure_title = (figure.caption and figure.caption.content) or ""
-        logger.info("Describing figure %s with title '%s'", figure.id, figure_title)
+        logger.info("Describing figure %s with title '%s' using %s", figure.id, figure_title, type(media_describer).__name__)
         if not figure.bounding_regions:
             return f"<figure><figcaption>{figure_title}</figcaption></figure>"
         if len(figure.bounding_regions) > 1:
@@ -228,7 +234,12 @@ async def figure_to_html(
         page_number = first_region["pageNumber"] # 1-indexed
         cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
         figure_description = await media_describer.describe_image(cropped_img)
-        return f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
+        return ImageOnPage(
+            bytes=cropped_img,
+            filename=f"page_{page_number}_figure_{figure.id}.png",
+            bbox=bounding_box,
+            description=f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
+        )
 
     @staticmethod
     def table_to_html(table: DocumentTable):
@@ -274,10 +285,6 @@ def crop_image_from_pdf_page(
         pix = page.get_pixmap(matrix=pymupdf.Matrix(page_dpi / bbox_dpi, page_dpi / bbox_dpi), clip=rect)
 
         img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
-        # print out the number of pixels
-        print(f"Cropped image size: {img.size} pixels")
         bytes_io = io.BytesIO()
         img.save(bytes_io, format="PNG")
-        with open(f"cropped_page_{page_number + 1}.png", "wb") as f:
-            f.write(bytes_io.getvalue())
         return bytes_io.getvalue()
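
Note (not part of the commit): a standalone sketch of the cropping technique used by crop_image_from_pdf_page above. The DPI values and the assumption that the bounding box is given in inches are placeholders, not taken from this diff.

import io

import pymupdf
from PIL import Image

def crop_figure(pdf_path: str, page_number: int, bbox_inches: tuple[float, float, float, float]) -> bytes:
    doc = pymupdf.open(pdf_path)
    page = doc.load_page(page_number)  # 0-indexed page
    bbox_dpi = 72    # assumed units of the bounding box (points per inch)
    page_dpi = 200   # assumed render resolution for the cropped figure
    rect = pymupdf.Rect(*(coord * bbox_dpi for coord in bbox_inches))
    pix = page.get_pixmap(matrix=pymupdf.Matrix(page_dpi / bbox_dpi, page_dpi / bbox_dpi), clip=rect)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()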

0 commit comments
