
Commit 2a73065

Getting image citations almost working
1 parent e85f8c5 commit 2a73065

9 files changed: +114 −102 lines

9 files changed

+114
-102
lines changed

app/backend/app.py

Lines changed: 1 addition & 0 deletions
@@ -686,6 +686,7 @@ async def setup_clients():
         agent_client=agent_client,
         openai_client=openai_client,
         auth_helper=auth_helper,
+        images_blob_container_client=image_blob_container_client,
         chatgpt_model=OPENAI_CHATGPT_MODEL,
         chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
         embedding_model=OPENAI_EMB_MODEL,
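
Note on wiring: the new images_blob_container_client keyword is passed through to each approach here, but the diff does not show how that client is constructed. A minimal sketch of one way to build it with the async Azure SDK; the environment variable names are assumptions, not taken from this commit:

import os

from azure.identity.aio import DefaultAzureCredential
from azure.storage.blob.aio import ContainerClient

# Sketch only: the storage account and container names below are hypothetical.
azure_credential = DefaultAzureCredential()
image_blob_container_client = ContainerClient(
    account_url=f"https://{os.environ['AZURE_STORAGE_ACCOUNT']}.blob.core.windows.net",
    container_name=os.environ["AZURE_IMAGESTORAGE_CONTAINER"],
    credential=azure_credential,
)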

app/backend/approaches/approach.py

Lines changed: 3 additions & 0 deletions
@@ -50,6 +50,7 @@ class Document:
     score: Optional[float] = None
     reranker_score: Optional[float] = None
     search_agent_query: Optional[str] = None
+    images: Optional[list[dict[str, Any]]] = None
 
     def serialize_for_results(self) -> dict[str, Any]:
         result_dict = {
@@ -75,6 +76,7 @@ def serialize_for_results(self) -> dict[str, Any]:
             "score": self.score,
             "reranker_score": self.reranker_score,
             "search_agent_query": self.search_agent_query,
+            "images": self.images,
         }
         return result_dict
 
@@ -238,6 +240,7 @@ async def search(
                         captions=cast(list[QueryCaptionResult], document.get("@search.captions")),
                         score=document.get("@search.score"),
                         reranker_score=document.get("@search.reranker_score"),
+                        images=document.get("images"),
                     )
                 )
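
The new images field is filled directly from the search document, so its shape is whatever the indexer wrote. The only key the rest of this commit relies on is "url" (see retrievethenread.py below, which reads img["url"]). A sketch of the expected shape; any keys beyond "url" are hypothetical:

# Sketch: what a value of Document.images is expected to look like downstream.
# Only the "url" key is read in this commit; any other keys would pass through untouched.
images = [
    {"url": "https://myaccount.blob.core.windows.net/images/Benefit_Options_pdf/page2/figure1_1.png"},
    {"url": "https://myaccount.blob.core.windows.net/images/Benefit_Options_pdf/page3/figure2_1.png"},
]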

app/backend/approaches/prompts/ask_answer_question.prompty

Lines changed: 5 additions & 4 deletions
@@ -19,10 +19,12 @@ Use 'you' to refer to the individual asking the questions even if they ask with
 Answer the following question using only the data provided in the sources below.
 Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response.
 If you cannot answer using the sources below, say you don't know. Use below example to answer.
-{% if use_images %}
-Each image source has the file name in the top left corner of the image with coordinates (10,10) pixels and is in the format SourceFileName:<file_name>.
+{% if image_sources %}
+Each image source has the original document file name in the top left corner of the image with coordinates (10,10) pixels and is in the format Document:<document_name.ext#page=N>.
+The filename of the actual image is in the top right corner of the image and is in the format Figure:<image_name.png>.
 Each text source starts in a new line and has the file name followed by colon and the actual information.
-Always include the source name from the image or text for each fact you use in the response in the format: [filename].
+Always include the source document filename for each fact you use in the response in the format: [document_name.ext#page=N].
+If you are referencing an image, add the image filename in the format: [document_name.ext#page=N(image_name.png)].
 Answer the following question using only the data provided in the sources below.
 The text and image source can be the same file name, don't use the image title when citing the image source, only use the file name as mentioned.
 If you cannot answer using the sources below, say you don't know. Return just the answer without any input texts.
@@ -50,6 +52,5 @@ user:
 {% if text_sources is defined %}
 Sources:
 {% for text_source in text_sources %}
-{{ text_source }}
 {% endfor %}
 {% endif %}
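
The template now keys the image-citation instructions off image_sources rather than a use_images flag, so the extra rules only appear when images were actually retrieved. A rough sketch of how that conditional behaves, rendered here with plain jinja2 to mimic the Jinja-style syntax these templates use; the values are made up for illustration:

from jinja2 import Template

snippet = Template(
    "{% if image_sources %}"
    "If you are referencing an image, add the image filename in the format: "
    "[document_name.ext#page=N(image_name.png)]."
    "{% endif %}"
)

# The instructions are included only when at least one image source is passed in:
print(snippet.render(image_sources=["data:image/png;base64,..."]))
# An empty or missing list renders to an empty string, keeping the prompt shorter:
print(snippet.render(image_sources=[]))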

app/backend/approaches/retrievethenread.py

Lines changed: 34 additions & 3 deletions
@@ -3,12 +3,14 @@
 from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient
 from azure.search.documents.aio import SearchClient
 from azure.search.documents.models import VectorQuery
+from azure.storage.blob.aio import ContainerClient
 from openai import AsyncOpenAI
 from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
 
 from approaches.approach import Approach, DataPoints, ExtraInfo, ThoughtStep
 from approaches.promptmanager import PromptManager
 from core.authentication import AuthenticationHelper
+from core.imageshelper import download_blob_as_base64
 
 
 class RetrieveThenReadApproach(Approach):
@@ -27,6 +29,7 @@ def __init__(
         agent_deployment: Optional[str],
         agent_client: KnowledgeAgentRetrievalClient,
         auth_helper: AuthenticationHelper,
+        images_blob_container_client: ContainerClient,
         openai_client: AsyncOpenAI,
         chatgpt_model: str,
         chatgpt_deployment: Optional[str],  # Not needed for non-Azure OpenAI
@@ -49,6 +52,7 @@ def __init__(
         self.chatgpt_deployment = chatgpt_deployment
         self.openai_client = openai_client
         self.auth_helper = auth_helper
+        self.images_blob_container_client = images_blob_container_client
         self.chatgpt_model = chatgpt_model
         self.embedding_model = embedding_model
         self.embedding_dimensions = embedding_dimensions
@@ -86,7 +90,11 @@ async def run(
         messages = self.prompt_manager.render_prompt(
             self.answer_prompt,
             self.get_system_prompt_variables(overrides.get("prompt_template"))
-            | {"user_query": q, "text_sources": extra_info.data_points.text},
+            | {
+                "user_query": q,
+                "text_sources": extra_info.data_points.text,
+                "image_sources": extra_info.data_points.images,
+            },
         )
 
         chat_completion = cast(
@@ -126,6 +134,7 @@ async def run_search_approach(
         use_semantic_ranker = True if overrides.get("semantic_ranker") else False
         use_query_rewriting = True if overrides.get("query_rewriting") else False
         use_semantic_captions = True if overrides.get("semantic_captions") else False
+        use_multimodal = True  # TODO: if overrides.get("use_multimodal") else False
         top = overrides.get("top", 3)
         minimum_search_score = overrides.get("minimum_search_score", 0.0)
         minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
@@ -137,6 +146,11 @@ async def run_search_approach(
         if use_vector_search:
            vectors.append(await self.compute_text_embedding(q))
 
+        # If multimodal is enabled, also compute image embeddings
+        # TODO: will this work with agentic? is this doing multivector search correctly?
+        # if use_multimodal:
+        #     vectors.append(await self.compute_image_embedding(q))
+
         results = await self.search(
             top,
             q,
@@ -151,10 +165,26 @@ async def run_search_approach(
             use_query_rewriting,
         )
 
-        text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False)
+        text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=use_multimodal)
+
+        # Extract unique image URLs from results if multimodal is enabled
+
+        seen_urls = set()
+        image_sources = []
+        if use_multimodal:
+            for doc in results:
+                if hasattr(doc, "images") and doc.images:
+                    for img in doc.images:
+                        # Skip if we've already processed this URL
+                        if img["url"] in seen_urls:
+                            continue
+                        seen_urls.add(img["url"])
+                        url = await download_blob_as_base64(self.images_blob_container_client, img["url"])
+                        if url:
+                            image_sources.append(url)
 
         return ExtraInfo(
-            DataPoints(text=text_sources),
+            DataPoints(text=text_sources, images=image_sources),
             thoughts=[
                 ThoughtStep(
                     "Search using user query",
@@ -167,6 +197,7 @@ async def run_search_approach(
                         "filter": filter,
                         "use_vector_search": use_vector_search,
                         "use_text_search": use_text_search,
+                        "use_multimodal": use_multimodal,
                     },
                 ),
                 ThoughtStep(
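
The URL-deduplication loop above downloads each referenced image once and hands it to the prompt as a base64 data URI. The same logic, pulled out into a standalone helper purely for readability — a sketch, not code from this commit:

from azure.storage.blob.aio import ContainerClient

from approaches.approach import Document
from core.imageshelper import download_blob_as_base64


async def collect_image_sources(results: list[Document], container_client: ContainerClient) -> list[str]:
    """Download each unique image referenced by the search results as a base64 data URI."""
    seen_urls: set[str] = set()
    image_sources: list[str] = []
    for doc in results:
        for img in doc.images or []:
            if img["url"] in seen_urls:
                continue  # this blob was already downloaded for an earlier result
            seen_urls.add(img["url"])
            data_uri = await download_blob_as_base64(container_client, img["url"])
            if data_uri:  # skip images whose blobs are missing
                image_sources.append(data_uri)
    return image_sources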

app/backend/core/imageshelper.py

Lines changed: 20 additions & 16 deletions
@@ -1,14 +1,11 @@
 import base64
 import logging
-import os
 from typing import Optional
 
 from azure.core.exceptions import ResourceNotFoundError
 from azure.storage.blob.aio import ContainerClient
 from typing_extensions import Literal, Required, TypedDict
 
-from approaches.approach import Document
-
 
 class ImageURL(TypedDict, total=False):
     url: Required[str]
@@ -18,23 +15,30 @@ class ImageURL(TypedDict, total=False):
     """Specifies the detail level of the image."""
 
 
-async def download_blob_as_base64(blob_container_client: ContainerClient, file_path: str) -> Optional[str]:
-    base_name, _ = os.path.splitext(file_path)
-    image_filename = base_name + ".png"
+async def download_blob_as_base64(blob_container_client: ContainerClient, blob_url: str) -> Optional[str]:
     try:
-        blob = await blob_container_client.get_blob_client(image_filename).download_blob()
+        # Handle full URLs
+        if blob_url.startswith("http"):
+            # Extract blob path from full URL
+            # URL format: https://{account}.blob.core.windows.net/{container}/{blob_path}
+            url_parts = blob_url.split("/")
+            # Skip the domain parts and container name to get the blob path
+            blob_path = "/".join(url_parts[4:])
+        else:
+            # Treat as a direct blob path
+            blob_path = blob_url
+
+        # Download the blob
+        blob = await blob_container_client.get_blob_client(blob_path).download_blob()
         if not blob.properties:
-            logging.warning(f"No blob exists for {image_filename}")
+            logging.warning(f"No blob exists for {blob_path}")
             return None
+
         img = base64.b64encode(await blob.readall()).decode("utf-8")
         return f"data:image/png;base64,{img}"
     except ResourceNotFoundError:
-        logging.warning(f"No blob exists for {image_filename}")
+        logging.warning(f"No blob exists for {blob_path}")
+        return None
+    except Exception as e:
+        logging.error(f"Error downloading blob {blob_url}: {str(e)}")
         return None
-
-
-async def fetch_image(blob_container_client: ContainerClient, result: Document) -> Optional[str]:
-    if result.sourcepage:
-        img = await download_blob_as_base64(blob_container_client, result.sourcepage)
-        return img
-    return None
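
The helper now accepts either a bare blob path or a full blob URL; for URLs it drops the scheme, host, and container segment and keeps the remainder as the blob path. The parsing in isolation, with a made-up URL:

blob_url = "https://myaccount.blob.core.windows.net/images/Benefit_Options_pdf/page2/figure1_1.png"
url_parts = blob_url.split("/")
# Indexes 0-3 are "https:", "", the host, and the container name; everything after is the blob path.
blob_path = "/".join(url_parts[4:])
assert blob_path == "Benefit_Options_pdf/page2/figure1_1.png"

Note that this keeps any query string attached to the last segment, so the URLs stored in the index are assumed to be plain blob URLs (the commit also removes the SAS-token generation that previously produced signed URLs).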

app/backend/prepdocslib/blobmanager.py

Lines changed: 42 additions & 73 deletions
@@ -1,20 +1,15 @@
-import datetime
 import io
 import logging
 import os
 import re
 from typing import Optional, Union
 
-import pymupdf
 from azure.core.credentials_async import AsyncTokenCredential
 from azure.storage.blob import (
-    BlobSasPermissions,
     UserDelegationKey,
-    generate_blob_sas,
 )
-from azure.storage.blob.aio import BlobServiceClient, ContainerClient
+from azure.storage.blob.aio import BlobServiceClient
 from PIL import Image, ImageDraw, ImageFont
-from pypdf import PdfReader
 
 from .listfilestrategy import File
 
@@ -64,7 +59,7 @@ async def upload_blob(self, file: File) -> Optional[list[str]]:
         return None
 
     async def upload_document_image(
-        self, document_file: File, image_bytes: bytes, image_filename: str
+        self, document_file: File, image_bytes: bytes, image_filename: str, image_page_num: int
     ) -> Optional[str]:
         if self.image_container is None:
             raise ValueError(
@@ -75,81 +70,55 @@ async def upload_document_image(
         ) as service_client, service_client.get_container_client(self.image_container) as container_client:
             if not await container_client.exists():
                 await container_client.create_container()
-            blob_name = BlobManager.blob_name_from_file_name(document_file.content.name) + "/" + image_filename
-            logger.info("Uploading blob for document image %s", blob_name)
-            blob_client = await container_client.upload_blob(blob_name, io.BytesIO(image_bytes), overwrite=True)
-            return blob_client.url
-        return None
-
-    def get_managedidentity_connectionstring(self):
-        return f"ResourceId=/subscriptions/{self.subscriptionId}/resourceGroups/{self.resourceGroup}/providers/Microsoft.Storage/storageAccounts/{self.account};"
-
-    async def upload_pdf_blob_images(
-        self, service_client: BlobServiceClient, container_client: ContainerClient, file: File
-    ) -> list[str]:
-        with open(file.content.name, "rb") as reopened_file:
-            reader = PdfReader(reopened_file)
-            page_count = len(reader.pages)
-        doc = pymupdf.open(file.content.name)
-        sas_uris = []
-        start_time = datetime.datetime.now(datetime.timezone.utc)
-        expiry_time = start_time + datetime.timedelta(days=1)
-
-        font = None
-        try:
-            font = ImageFont.truetype("arial.ttf", 20)
-        except OSError:
-            try:
-                font = ImageFont.truetype("/usr/share/fonts/truetype/freefont/FreeMono.ttf", 20)
-            except OSError:
-                logger.info("Unable to find arial.ttf or FreeMono.ttf, using default font")
-
-        for i in range(page_count):
-            blob_name = BlobManager.blob_image_name_from_file_page(file.content.name, i)
-            logger.info("Converting page %s to image and uploading -> %s", i, blob_name)
-
-            doc = pymupdf.open(file.content.name)
-            page = doc.load_page(i)
-            pix = page.get_pixmap()
-            original_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)  # type: ignore
 
-            # Create a new image with additional space for text
-            text_height = 40  # Height of the text area
-            new_img = Image.new("RGB", (original_img.width, original_img.height + text_height), "white")
+            # Load and modify the image to add text
+            image = Image.open(io.BytesIO(image_bytes))
+            text_height = 40
+            new_img = Image.new("RGB", (image.width, image.height + text_height), "white")
+            new_img.paste(image, (0, text_height))
 
-            # Paste the original image onto the new image
-            new_img.paste(original_img, (0, text_height))
-
-            # Draw the text on the white area
+            # Add text
             draw = ImageDraw.Draw(new_img)
-            text = f"SourceFileName:{blob_name}"
+            sourcepage = BlobManager.sourcepage_from_file_page(document_file.content.name, page=image_page_num)
+            text = f"Document: {sourcepage}"
 
-            # 10 pixels from the top and left of the image
-            x = 10
-            y = 10
-            draw.text((x, y), text, font=font, fill="black")
+            font = None
+            try:
+                font = ImageFont.truetype("arial.ttf", 24)
+            except OSError:
+                try:
+                    font = ImageFont.truetype("/usr/share/fonts/truetype/freefont/FreeMono.ttf", 24)
+                except OSError:
+                    logger.info("Unable to find arial.ttf or FreeMono.ttf, using default font")
+
+            # Draw document text on left
+            draw.text((10, 10), text, font=font, fill="black")
+
+            # Draw figure text on right
+            figure_text = f"Figure: {image_filename}"
+            if font:
+                # Get the width of the text to position it on the right
+                text_width = draw.textlength(figure_text, font=font)
+                draw.text((new_img.width - text_width - 10, 10), figure_text, font=font, fill="black")
+            else:
+                # If no font available, make a best effort to position on right
+                draw.text((new_img.width - 200, 10), figure_text, font=font, fill="black")
 
+            # Convert back to bytes
             output = io.BytesIO()
-            new_img.save(output, format="PNG")
+            new_img.save(output, format=image.format or "PNG")
             output.seek(0)
 
+            blob_name = (
+                f"{self.blob_name_from_file_name(document_file.content.name)}/page{image_page_num}/{image_filename}"
+            )
+            logger.info("Uploading blob for document image %s", blob_name)
             blob_client = await container_client.upload_blob(blob_name, output, overwrite=True)
-            if not self.user_delegation_key:
-                self.user_delegation_key = await service_client.get_user_delegation_key(start_time, expiry_time)
-
-            if blob_client.account_name is not None:
-                sas_token = generate_blob_sas(
-                    account_name=blob_client.account_name,
-                    container_name=blob_client.container_name,
-                    blob_name=blob_client.blob_name,
-                    user_delegation_key=self.user_delegation_key,
-                    permission=BlobSasPermissions(read=True),
-                    expiry=expiry_time,
-                    start=start_time,
-                )
-                sas_uris.append(f"{blob_client.url}?{sas_token}")
-
-        return sas_uris
+            return blob_client.url
+        return None
+
+    def get_managedidentity_connectionstring(self):
+        return f"ResourceId=/subscriptions/{self.subscriptionId}/resourceGroups/{self.resourceGroup}/providers/Microsoft.Storage/storageAccounts/{self.account};"
 
     async def remove_blob(self, path: Optional[str] = None):
         async with BlobServiceClient(
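
upload_document_image now stamps two labels onto the image (the source document page on the left, the figure filename on the right) and stores it under a page-scoped blob name, which is what the new prompt citation format refers back to. A rough sketch of how the blob name and a citation line up, with illustrative file names and simplified stand-ins for blob_name_from_file_name and sourcepage_from_file_page (neither helper is shown in this diff):

# Sketch only: the real helpers may normalize names and page numbers differently.
document_name = "Benefit_Options.pdf"
image_page_num = 2
image_filename = "figure1_1.png"

blob_name = f"{document_name.replace('.', '_')}/page{image_page_num}/{image_filename}"
citation = f"[{document_name}#page={image_page_num}({image_filename})]"

print(blob_name)  # Benefit_Options_pdf/page2/figure1_1.png
print(citation)   # [Benefit_Options.pdf#page=2(figure1_1.png)]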

app/backend/prepdocslib/filestrategy.py

Lines changed: 4 additions & 5 deletions
@@ -31,7 +31,7 @@ async def parse_file(
    for page in pages:
        for image in page.images:
            if image.url is None:
-                image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename)
+                image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename, image.page_num)
            if image_embeddings_client:
                image.embedding = await image_embeddings_client.create_embedding(image.bytes)
    logger.info("Splitting '%s' into sections", file.filename())
@@ -43,9 +43,6 @@
        section.split_page.images = [
            image for page in pages if page.page_num == section.split_page.page_num for image in page.images
        ]
-        logger.info(
-            "Section for page %d has %d images", section.split_page.page_num, len(section.split_page.images)
-        )
    return sections
 
 
@@ -115,7 +112,9 @@ async def run(self):
        files = self.list_file_strategy.list()
        async for file in files:
            try:
-                sections = await parse_file(file, self.file_processors, self.category, self.blob_manager, self.image_embeddings)
+                sections = await parse_file(
+                    file, self.file_processors, self.category, self.blob_manager, self.image_embeddings
+                )
                if sections:
                    await self.search_manager.update_content(sections, url=file.url)
            finally:
