Add LLM-based media describer

pamelafox · pamelafox · commit 74fdf48b8f50 · 2025-05-29T18:32:39.000Z
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
@@ -8,6 +8,7 @@
 from azure.core.credentials_async import AsyncTokenCredential
 from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
 from rich.logging import RichHandler
+from openai import AsyncAzureOpenAI, AsyncOpenAI
 
 from load_azd_env import load_azd_env
 from prepdocslib.blobmanager import BlobManager
@@ -30,7 +31,7 @@
     LocalListFileStrategy,
 )
 from prepdocslib.parser import Parser
-from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser
+from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser, MediaDescriptionStrategy
 from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy
 from prepdocslib.textparser import TextParser
 from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
@@ -178,6 +179,9 @@ def setup_file_processors(
     search_images: bool = False,
     use_content_understanding: bool = False,
     use_multimodal: bool = False,
+    openai_client: Union[AsyncOpenAI, None] = None,
+    openai_model: Union[str, None] = None,
+    openai_deployment: Union[str, None] = None,
     content_understanding_endpoint: Union[str, None] = None,
 ):
     sentence_text_splitter = SentenceTextSplitter()
@@ -191,7 +195,10 @@ def setup_file_processors(
         doc_int_parser = DocumentAnalysisParser(
             endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
             credential=documentintelligence_creds,
-            include_media_description=use_content_understanding or use_multimodal,
+            # Pass enum members, not strings: DocumentAnalysisParser compares this value
+            # against MediaDescriptionStrategy members, and a plain str never equals one.
+            # Multimodal (OpenAI) takes precedence over Content Understanding when both are set.
+            media_description_strategy=(
+                MediaDescriptionStrategy.OPENAI
+                if use_multimodal
+                else (
+                    MediaDescriptionStrategy.CONTENTUNDERSTANDING
+                    if use_content_understanding
+                    else MediaDescriptionStrategy.NONE
+                )
+            ),
+            openai_client=openai_client,
+            openai_model=openai_model,
+            openai_deployment=openai_deployment,
             content_understanding_endpoint=content_understanding_endpoint,
         )
 
diff --git a/app/backend/prepdocslib/goals.json b/app/backend/prepdocslib/goals.json
@@ -2,6 +2,8 @@
  "embedding": [0, 1, 2],
  "sourcepage": "bla.pdf#page=2",
  "sourcefile": "bla.pdf",
+ "oids": [],
+ "groups": [],
  "images": # collection of objects with fields https://learn.microsoft.com/en-us/azure/search/vector-search-multi-vector-fields
    [ {embedding, url, verbalization, boundingbox},
      {embedding, url, verbalization, boundingbox} ]
diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py
@@ -1,11 +1,13 @@
 import logging
 from abc import ABC
+import base64
 
 import aiohttp
 from azure.core.credentials_async import AsyncTokenCredential
 from azure.identity.aio import get_bearer_token_provider
 from rich.progress import Progress
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
+from openai import AsyncOpenAI
 
 logger = logging.getLogger("scripts")
 
@@ -105,3 +107,31 @@ async def describe_image(self, image_bytes: bytes) -> str:
 
                 fields = results["result"]["contents"][0]["fields"]
                 return fields["Description"]["valueString"]
+
+class MultimodalModelDescriber(MediaDescriber):
+    """Describes images by sending them to a chat-completions model through an OpenAI client.
+
+    Attributes are stored as given: `model` is the model name, and `deployment`, when it is
+    not None, is sent in place of `model` on each request.
+    """
+
+    def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: str):
+        self.openai_client = openai_client
+        self.model = model
+        # NOTE(review): annotated as str, but describe_image explicitly handles None here.
+        self.deployment = deployment
+
+    async def describe_image(self, image_bytes: bytes) -> str:
+        """Ask the model for a detailed description of an image.
+
+        Args:
+            image_bytes: Raw bytes of the image; they are base64-encoded into a data URI.
+
+        Returns:
+            The description with surrounding whitespace stripped, or "" when the response
+            has no choices or the first choice carries no text content.
+        """
+        logger.info("Describing image using LLM...")
+        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+        # NOTE(review): the MIME type is hard-coded; assumes callers pass PNG bytes - confirm.
+        image_datauri = f"data:image/png;base64,{image_base64}"
+
+        response = await self.openai_client.chat.completions.create(
+            # The deployment name takes precedence over the model name when it is set.
+            model=self.model if self.deployment is None else self.deployment,
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant that describes images.",
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"text": "Describe this image in detail", "type": "text"},
+                        {"image_url": {"url": image_datauri}, "type": "image_url"},
+                    ],
+                },
+            ],
+        )
+        if not response.choices:
+            return ""
+        # message.content is optional in the API response; calling .strip() on None would raise.
+        description = response.choices[0].message.content
+        return description.strip() if description else ""
+
diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py
@@ -3,7 +3,7 @@
 import logging
 from collections.abc import AsyncGenerator
 from enum import Enum
-from typing import IO, Union
+from typing import IO, Union, Optional
 
 import pymupdf
 from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
@@ -20,7 +20,7 @@
 from pypdf import PdfReader
 from openai import AsyncOpenAI
 
-from .mediadescriber import ContentUnderstandingDescriber
+from .mediadescriber import MediaDescriber, ContentUnderstandingDescriber, MultimodalModelDescriber
 from .page import Page
 from .parser import Parser
 
@@ -45,6 +45,11 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
             offset += len(page_text)
 
 
+class MediaDescriptionStrategy(Enum):
+    """Selects how DocumentAnalysisParser obtains descriptions for figures in a document."""
+
+    # No media describer is created and figures are not collected.
+    NONE = "none"
+    # Figures are described by MultimodalModelDescriber using the supplied OpenAI client.
+    OPENAI = "openai"
+    # Figures are described by ContentUnderstandingDescriber (requires an endpoint).
+    CONTENTUNDERSTANDING = "content_understanding"
+
 class DocumentAnalysisParser(Parser):
     """
     Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages
@@ -57,13 +62,27 @@ def __init__(
         credential: Union[AsyncTokenCredential, AzureKeyCredential],
         model_id="prebuilt-layout",
         include_media_description: bool = False,
+        media_description_strategy: Enum = MediaDescriptionStrategy.NONE,
+        # If using OpenAI, this is the client to use
+        openai_client: Union[AsyncOpenAI, None] = None,
+        openai_model: Optional[str] = None,
+        openai_deployment: Optional[str] = None,
+        # If using Content Understanding, this is the endpoint for the service
         content_understanding_endpoint: Union[str, None] = None,
     ):
         self.model_id = model_id
         self.endpoint = endpoint
         self.credential = credential
-        self.use_content_understanding = use_content_understanding
-        self.content_understanding_endpoint = content_understanding_endpoint
+        self.media_description_strategy = media_description_strategy
+        if media_description_strategy == MediaDescriptionStrategy.OPENAI:
+            logger.info("Including media description with OpenAI")
+            self.use_content_understanding = False
+            self.openai_client = openai_client
+            self.openai_model = openai_model
+            self.openai_deployment = openai_deployment
+        if media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING:
+            logger.info("Including media description with Azure Content Understanding")
+            self.content_understanding_endpoint = content_understanding_endpoint
 
     async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
         logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)
@@ -72,14 +91,23 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
             endpoint=self.endpoint, credential=self.credential
         ) as document_intelligence_client:
             file_analyzed = False
-            if self.use_content_understanding:
+
+            media_describer: Union[ContentUnderstandingDescriber, MultimodalModelDescriber, None] = None
+            if self.media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING:
                 if self.content_understanding_endpoint is None:
-                    raise ValueError("Content Understanding is enabled but no endpoint was provided")
+                    raise ValueError("Content Understanding endpoint must be provided when using Content Understanding strategy")
                 if isinstance(self.credential, AzureKeyCredential):
                     raise ValueError(
                         "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead"
                     )
-                cu_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)
+                media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)
+                
+            if self.media_description_strategy == MediaDescriptionStrategy.OPENAI:
+                if self.openai_client is None or self.openai_model is None:
+                    raise ValueError("OpenAI client must be provided when using OpenAI media description strategy")
+                media_describer = MultimodalModelDescriber(self.openai_client, self.openai_model, self.openai_deployment)
+            
+            if media_describer is not None:
                 content_bytes = content.read()
                 try:
                     poller = await document_intelligence_client.begin_analyze_document(
@@ -117,7 +145,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
                     if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number
                 ]
                 figures_on_page = []
-                if self.use_content_understanding:
+                if self.media_description_strategy != MediaDescriptionStrategy.NONE:
                     figures_on_page = [
                         figure
                         for figure in (analyze_result.figures or [])
@@ -163,13 +191,13 @@ class ObjectType(Enum):
                             page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx])
                             added_objects.add(mask_char)
                     elif object_type == ObjectType.FIGURE:
-                        if cu_describer is None:
-                            raise ValueError("cu_describer should not be None, unable to describe figure")
+                        if media_describer is None:
+                            raise ValueError("media_describer should not be None, unable to describe figure")
                         if object_idx is None:
                             raise ValueError("Expected object_idx to be set")
                         if mask_char not in added_objects:
                             figure_html = await DocumentAnalysisParser.figure_to_html(
-                                doc_for_pymupdf, figures_on_page[object_idx], cu_describer
+                                doc_for_pymupdf, figures_on_page[object_idx], media_describer
                             )
                             page_text += figure_html
                             added_objects.add(mask_char)
@@ -182,7 +210,7 @@ class ObjectType(Enum):
 
     @staticmethod
     async def figure_to_html(
-        doc: pymupdf.Document, figure: DocumentFigure, cu_describer: ContentUnderstandingDescriber
+        doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber
     ) -> str:
         figure_title = (figure.caption and figure.caption.content) or ""
         logger.info("Describing figure %s with title '%s'", figure.id, figure_title)
@@ -200,7 +228,7 @@ async def figure_to_html(
         )
         page_number = first_region["pageNumber"]  # 1-indexed
         cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
-        figure_description = await cu_describer.describe_image(cropped_img)
+        figure_description = await media_describer.describe_image(cropped_img)
         return f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
 
     @staticmethod