Skip to content

Commit 74fdf48

Browse files
committed
Add LLM-based media describer
1 parent b55ca88 commit 74fdf48

File tree

4 files changed

+82
-15
lines changed

4 files changed

+82
-15
lines changed

app/backend/prepdocs.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from azure.core.credentials_async import AsyncTokenCredential
99
from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
1010
from rich.logging import RichHandler
11+
from openai import AsyncAzureOpenAI, AsyncOpenAI
1112

1213
from load_azd_env import load_azd_env
1314
from prepdocslib.blobmanager import BlobManager
@@ -30,7 +31,7 @@
3031
LocalListFileStrategy,
3132
)
3233
from prepdocslib.parser import Parser
33-
from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser
34+
from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser, MediaDescriptionStrategy
3435
from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy
3536
from prepdocslib.textparser import TextParser
3637
from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
@@ -178,6 +179,9 @@ def setup_file_processors(
178179
search_images: bool = False,
179180
use_content_understanding: bool = False,
180181
use_multimodal: bool = False,
182+
openai_client: Union[AsyncOpenAI, None] = None,
183+
openai_model: Union[str, None] = None,
184+
openai_deployment: Union[str, None] = None,
181185
content_understanding_endpoint: Union[str, None] = None,
182186
):
183187
sentence_text_splitter = SentenceTextSplitter()
@@ -191,7 +195,10 @@ def setup_file_processors(
191195
doc_int_parser = DocumentAnalysisParser(
192196
endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
193197
credential=documentintelligence_creds,
194-
include_media_description=use_content_understanding or use_multimodal,
198+
media_description_strategy = "openai" if use_multimodal else "contentunderstanding" if use_content_understanding else "none",
199+
openai_client=openai_client,
200+
openai_model=openai_model,
201+
openai_deployment=openai_deployment,
195202
content_understanding_endpoint=content_understanding_endpoint,
196203
)
197204

app/backend/prepdocslib/goals.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
"embedding": [0, 1, 2],
33
"sourcepage": "bla.pdf#page=2",
44
"sourcefile": "bla.pdf",
5+
"oids": [],
6+
"groups": [],
57
"images": # collection of objects with fields https://learn.microsoft.com/en-us/azure/search/vector-search-multi-vector-fields
68
[ {embedding, url, verbalization, boundingbox},
79
{embedding, url, verbalization, boundingbox} ]

app/backend/prepdocslib/mediadescriber.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import logging
22
from abc import ABC
3+
import base64
34

45
import aiohttp
56
from azure.core.credentials_async import AsyncTokenCredential
67
from azure.identity.aio import get_bearer_token_provider
78
from rich.progress import Progress
89
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
10+
from openai import AsyncOpenAI
911

1012
logger = logging.getLogger("scripts")
1113

@@ -105,3 +107,31 @@ async def describe_image(self, image_bytes: bytes) -> str:
105107

106108
fields = results["result"]["contents"][0]["fields"]
107109
return fields["Description"]["valueString"]
110+
111+
class MultimodalModelDescriber(MediaDescriber):
    """Describe images by sending them to a multimodal chat-completions model.

    The image is embedded in the request as a base64 data URI, and the text of
    the first returned choice is used as the description.
    """

    def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: "str | None" = None):
        """Store the client and the model/deployment names used for requests.

        Args:
            openai_client: Async OpenAI client used to issue the chat completion.
            model: Model name; sent as the request's ``model`` when no
                deployment is given.
            deployment: Optional deployment name; when it is not None it is
                sent as the request's ``model`` instead of ``model``.
        """
        self.openai_client = openai_client
        self.model = model
        self.deployment = deployment

    async def describe_image(self, image_bytes: bytes) -> str:
        """Return a model-generated description of the given image.

        Args:
            image_bytes: Raw bytes of the image to describe.

        Returns:
            The stripped text of the first choice, or an empty string when the
            response has no choices or the first choice carries no text.
        """
        logger.info("Describing image using LLM...")
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
        # NOTE(review): the MIME type is hard-coded; this assumes callers pass PNG bytes - confirm.
        image_datauri = f"data:image/png;base64,{image_base64}"

        response = await self.openai_client.chat.completions.create(
            model=self.model if self.deployment is None else self.deployment,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that describes images.",
                },
                {
                    "role": "user",
                    "content": [
                        {"text": "Describe this image in detail", "type": "text"},
                        {"image_url": {"url": image_datauri}, "type": "image_url"},
                    ],
                },
            ],
        )
        if not response.choices:
            return ""
        # The API types message.content as optional, so guard against None
        # before calling .strip() to avoid an AttributeError.
        description = response.choices[0].message.content
        return description.strip() if description else ""
137+

app/backend/prepdocslib/pdfparser.py

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import logging
44
from collections.abc import AsyncGenerator
55
from enum import Enum
6-
from typing import IO, Union
6+
from typing import IO, Union, Optional
77

88
import pymupdf
99
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
@@ -20,7 +20,7 @@
2020
from pypdf import PdfReader
2121
from openai import AsyncOpenAI
2222

23-
from .mediadescriber import ContentUnderstandingDescriber
23+
from .mediadescriber import MediaDescriber, ContentUnderstandingDescriber, MultimodalModelDescriber
2424
from .page import Page
2525
from .parser import Parser
2626

@@ -45,6 +45,11 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
4545
offset += len(page_text)
4646

4747

48+
class MediaDescriptionStrategy(Enum):
    """Selects which backend, if any, is used to describe figures in a document.

    This is a plain ``Enum``: members do not compare equal to their string
    values, so ``"openai" == MediaDescriptionStrategy.OPENAI`` is False.
    Callers holding a string should coerce it first with
    ``MediaDescriptionStrategy(value)``.
    """

    # Do not generate media descriptions.
    NONE = "none"
    # Describe media through an OpenAI client.
    OPENAI = "openai"
    # Describe media through the Content Understanding service.
    CONTENTUNDERSTANDING = "content_understanding"

    @classmethod
    def _missing_(cls, value):
        """Resolve string spellings that differ only in case or separators.

        Called by ``Enum`` when ``value`` matches no member exactly. Accepts,
        for example, ``"OpenAI"`` or ``"contentunderstanding"``. Returning
        None lets ``Enum`` raise its usual ``ValueError`` for unknown values.
        """
        if isinstance(value, str):
            wanted = value.strip().lower().replace("_", "").replace("-", "")
            for member in cls:
                if member.value.replace("_", "") == wanted:
                    return member
        return None
52+
4853
class DocumentAnalysisParser(Parser):
4954
"""
5055
Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages
@@ -57,13 +62,27 @@ def __init__(
5762
credential: Union[AsyncTokenCredential, AzureKeyCredential],
5863
model_id="prebuilt-layout",
5964
include_media_description: bool = False,
65+
media_description_strategy: Enum = MediaDescriptionStrategy.NONE,
66+
# If using OpenAI, this is the client to use
67+
openai_client: Union[AsyncOpenAI, None] = None,
68+
openai_model: Optional[str] = None,
69+
openai_deployment: Optional[str] = None,
70+
# If using Content Understanding, this is the endpoint for the service
6071
content_understanding_endpoint: Union[str, None] = None,
6172
):
6273
self.model_id = model_id
6374
self.endpoint = endpoint
6475
self.credential = credential
65-
self.use_content_understanding = use_content_understanding
66-
self.content_understanding_endpoint = content_understanding_endpoint
76+
self.media_description_strategy = media_description_strategy
77+
if media_description_strategy == MediaDescriptionStrategy.OPENAI:
78+
logger.info("Including media description with OpenAI")
79+
self.use_content_understanding = False
80+
self.openai_client = openai_client
81+
self.openai_model = openai_model
82+
self.openai_deployment = openai_deployment
83+
if media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING:
84+
logger.info("Including media description with Azure Content Understanding")
85+
self.content_understanding_endpoint = content_understanding_endpoint
6786

6887
async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
6988
logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)
@@ -72,14 +91,23 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
7291
endpoint=self.endpoint, credential=self.credential
7392
) as document_intelligence_client:
7493
file_analyzed = False
75-
if self.use_content_understanding:
94+
95+
media_describer: Union[ContentUnderstandingDescriber, MultimodalModelDescriber, None] = None
96+
if self.media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING:
7697
if self.content_understanding_endpoint is None:
77-
raise ValueError("Content Understanding is enabled but no endpoint was provided")
98+
raise ValueError("Content Understanding endpoint must be provided when using Content Understanding strategy")
7899
if isinstance(self.credential, AzureKeyCredential):
79100
raise ValueError(
80101
"AzureKeyCredential is not supported for Content Understanding, use keyless auth instead"
81102
)
82-
cu_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)
103+
media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)
104+
105+
if self.media_description_strategy == MediaDescriptionStrategy.OPENAI:
106+
if self.openai_client is None or self.openai_model is None:
107+
raise ValueError("OpenAI client must be provided when using OpenAI media description strategy")
108+
media_describer = MultimodalModelDescriber(self.openai_client, self.openai_model, self.openai_deployment)
109+
110+
if media_describer is not None:
83111
content_bytes = content.read()
84112
try:
85113
poller = await document_intelligence_client.begin_analyze_document(
@@ -117,7 +145,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
117145
if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number
118146
]
119147
figures_on_page = []
120-
if self.use_content_understanding:
148+
if self.media_description_strategy != MediaDescriptionStrategy.NONE:
121149
figures_on_page = [
122150
figure
123151
for figure in (analyze_result.figures or [])
@@ -163,13 +191,13 @@ class ObjectType(Enum):
163191
page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx])
164192
added_objects.add(mask_char)
165193
elif object_type == ObjectType.FIGURE:
166-
if cu_describer is None:
167-
raise ValueError("cu_describer should not be None, unable to describe figure")
194+
if media_describer is None:
195+
raise ValueError("media_describer should not be None, unable to describe figure")
168196
if object_idx is None:
169197
raise ValueError("Expected object_idx to be set")
170198
if mask_char not in added_objects:
171199
figure_html = await DocumentAnalysisParser.figure_to_html(
172-
doc_for_pymupdf, figures_on_page[object_idx], cu_describer
200+
doc_for_pymupdf, figures_on_page[object_idx], media_describer
173201
)
174202
page_text += figure_html
175203
added_objects.add(mask_char)
@@ -182,7 +210,7 @@ class ObjectType(Enum):
182210

183211
@staticmethod
184212
async def figure_to_html(
185-
doc: pymupdf.Document, figure: DocumentFigure, cu_describer: ContentUnderstandingDescriber
213+
doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber
186214
) -> str:
187215
figure_title = (figure.caption and figure.caption.content) or ""
188216
logger.info("Describing figure %s with title '%s'", figure.id, figure_title)
@@ -200,7 +228,7 @@ async def figure_to_html(
200228
)
201229
page_number = first_region["pageNumber"] # 1-indexed
202230
cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
203-
figure_description = await cu_describer.describe_image(cropped_img)
231+
figure_description = await media_describer.describe_image(cropped_img)
204232
return f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
205233

206234
@staticmethod

0 commit comments

Comments
 (0)