
Commit ea3ee28

More prepdocs improvements for image handling
1 parent 7c8f825 commit ea3ee28

File tree

9 files changed, +133 -97 lines changed


app/backend/prepdocslib/blobmanager.py

Lines changed: 13 additions & 8 deletions
@@ -56,16 +56,21 @@ async def upload_blob(self, file: File) -> Optional[list[str]]:
             if file.url is None:
                 with open(file.content.name, "rb") as reopened_file:
                     blob_name = BlobManager.blob_name_from_file_name(file.content.name)
-                    logger.info("Uploading blob for whole file -> %s", blob_name)
+                    logger.info("Uploading blob for document %s", blob_name)
                     blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
                     file.url = blob_client.url
-
-            #if self.store_page_images:
-            #    if os.path.splitext(file.content.name)[1].lower() == ".pdf":
-            #        return await self.upload_pdf_blob_images(service_client, container_client, file)
-            #    else:
-            #        logger.info("File %s is not a PDF, skipping image upload", file.content.name)
-
+        return None
+
+    async def upload_document_image(self, document_file: File, image_bytes: bytes, image_filename: str) -> Optional[str]:
+        async with BlobServiceClient(
+            account_url=self.endpoint, credential=self.credential, max_single_put_size=4 * 1024 * 1024
+        ) as service_client, service_client.get_container_client(self.container) as container_client:
+            if not await container_client.exists():
+                await container_client.create_container()
+            blob_name = BlobManager.blob_name_from_file_name(document_file.content.name) + "/" + image_filename
+            logger.info("Uploading blob for document image %s", blob_name)
+            blob_client = await container_client.upload_blob(blob_name, io.BytesIO(image_bytes), overwrite=True)
+            return blob_client.url
         return None
 
     def get_managedidentity_connectionstring(self):
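
Note (not part of the commit): a minimal usage sketch for the new upload_document_image method. It assumes blob_manager is an already-configured BlobManager, file is the prepdocs File for the source document, and images is a list of ImageOnPage objects produced by the parser; each image blob is stored under "<document blob name>/<image filename>".

# Hypothetical helper, for illustration only
async def upload_figures(blob_manager, file, images):
    for image in images:
        # upload_document_image returns the blob URL for the stored figure
        image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename)
    return images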

app/backend/prepdocslib/embeddings.py

Lines changed: 10 additions & 14 deletions
@@ -236,28 +236,24 @@ def __init__(self, endpoint: str, token_provider: Callable[[], Awaitable[str]]):
         self.token_provider = token_provider
         self.endpoint = endpoint
 
-    async def create_embeddings(self, blob_urls: list[str]) -> list[list[float]]:
+    async def create_embedding(self, image_bytes: bytes) -> list[float]:
         endpoint = urljoin(self.endpoint, "computervision/retrieval:vectorizeImage")
-        headers = {"Content-Type": "application/json"}
         params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
-        headers["Authorization"] = "Bearer " + await self.token_provider()
+        headers = {"Authorization": "Bearer " + await self.token_provider()}
 
-        embeddings: list[list[float]] = []
         async with aiohttp.ClientSession(headers=headers) as session:
-            for blob_url in blob_urls:
-                async for attempt in AsyncRetrying(
-                    retry=retry_if_exception_type(Exception),
+            async for attempt in AsyncRetrying(
+                retry=retry_if_exception_type(Exception),
                 wait=wait_random_exponential(min=15, max=60),
                 stop=stop_after_attempt(15),
                 before_sleep=self.before_retry_sleep,
             ):
-                with attempt:
-                    body = {"url": blob_url}
-                    async with session.post(url=endpoint, params=params, json=body) as resp:
-                        resp_json = await resp.json()
-                        embeddings.append(resp_json["vector"])
-
-        return embeddings
+                with attempt:
+                    async with session.post(url=endpoint, params=params, data=image_bytes) as resp:
+                        resp_json = await resp.json()
+                        return resp_json["vector"]
+
+        return []
 
     def before_retry_sleep(self, retry_state):
         logger.info("Rate limited on the Vision embeddings API, sleeping before retrying...")

app/backend/prepdocslib/filestrategy.py

Lines changed: 18 additions & 9 deletions
@@ -18,7 +18,8 @@ async def parse_file(
     file: File,
     file_processors: dict[str, FileProcessor],
     category: Optional[str] = None,
-    image_embeddings: Optional[ImageEmbeddings] = None,
+    blob_manager: Optional[BlobManager] = None,
+    image_embeddings_client: Optional[ImageEmbeddings] = None,
 ) -> list[Section]:
     key = file.file_extension().lower()
     processor = file_processors.get(key)
@@ -27,12 +28,24 @@ async def parse_file(
         return []
     logger.info("Ingesting '%s'", file.filename())
     pages = [page async for page in processor.parser.parse(content=file.content)]
+    for page in pages:
+        for image in page.images:
+            if image.url is None:
+                image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename)
+            if image_embeddings_client:
+                image.embedding = await image_embeddings_client.create_embedding(image.bytes)
     logger.info("Splitting '%s' into sections", file.filename())
-    if image_embeddings:
-        logger.warning("Each page will be split into smaller chunks of text, but images will be of the entire page.")
     sections = [
         Section(split_page, content=file, category=category) for split_page in processor.splitter.split_pages(pages)
     ]
+    # For now, add the images back to each split page based off split_page.page_num
+    for section in sections:
+        section.split_page.images = [
+            image for page in pages if page.page_num == section.split_page.page_num for image in page.images
+        ]
+        logger.info(
+            "Section for page %d has %d images", section.split_page.page_num, len(section.split_page.images)
+        )
     return sections
 
 
@@ -102,13 +115,9 @@ async def run(self):
         files = self.list_file_strategy.list()
         async for file in files:
             try:
-                sections = await parse_file(file, self.file_processors, self.category, self.image_embeddings)
+                sections = await parse_file(file, self.file_processors, self.category, self.blob_manager, self.image_embeddings)
                 if sections:
-                    blob_sas_uris = await self.blob_manager.upload_blob(file)
-                    blob_image_embeddings: Optional[list[list[float]]] = None
-                    if self.image_embeddings and blob_sas_uris:
-                        blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
-                    await self.search_manager.update_content(sections, blob_image_embeddings, url=file.url)
+                    await self.search_manager.update_content(sections, url=file.url)
             finally:
                 if file:
                     file.close()
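
Note (not part of the commit): an end-to-end sketch of how the reworked parse_file is intended to be called, per the run() change above. It assumes file_processors, blob_manager, image_embeddings, and search_manager are already configured as elsewhere in prepdocs.

async def ingest_one(file, file_processors, blob_manager, image_embeddings, search_manager, category=None):
    sections = await parse_file(
        file,
        file_processors,
        category=category,
        blob_manager=blob_manager,
        image_embeddings_client=image_embeddings,
    )
    if sections:
        # Image URLs and embeddings are already attached to each section's split page
        await search_manager.update_content(sections, url=file.url)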

app/backend/prepdocslib/goals.json

Lines changed: 9 additions & 2 deletions
@@ -5,5 +5,12 @@
     "oids": [],
     "groups": [],
     "images": # collection of objects with fields https://learn.microsoft.com/en-us/azure/search/vector-search-multi-vector-fields
-        [ {embedding, url, verbalization, boundingbox},
-          {embedding, url, verbalization, boundingbox} ]
+        [ {embedding, url, description, boundingbox},
+          {embedding, url, description, boundingbox} ]
+
+# Consider gpt-4.1-mini as default: pricier? but relatively not pricey compared to o3 and gpt-4o. run our evals. its better at instruction following.
+
+# Parse each page, get back text with descritpions, associate each page with images on that page
+# Each image needs the citation file.pdf#figure=1 via Pillow
+# Each image needs to be stored in Blob storage
+# Update the search index with all the info
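
Note (illustration only, based on the notes above): a rough sketch of what one indexed document with a multi-vector "images" collection might look like. The field names follow the notes (embedding, url, description, boundingbox); this is not a final schema, and all values are placeholders.

example_document = {
    "id": "file-pdf-page-3",
    "content": "Chunked page text ...",
    "oids": [],
    "groups": [],
    "images": [
        {
            "embedding": [0.012, -0.034],  # truncated image embedding vector
            "url": "https://<account>.blob.core.windows.net/content/file.pdf/page_3_figure_3.1.png",
            "description": "<figure><figcaption>Chart title<br>Model-generated description</figcaption></figure>",
            "boundingbox": [1.0, 2.0, 4.5, 6.0],
        }
    ],
}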

app/backend/prepdocslib/mediadescriber.py

Lines changed: 1 addition & 4 deletions
@@ -86,7 +86,6 @@ async def create_analyzer(self):
             await self.poll_api(session, poll_url, headers)
 
     async def describe_image(self, image_bytes: bytes) -> str:
-        logger.info("Sending image to Azure Content Understanding service...")
         async with aiohttp.ClientSession() as session:
             token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
             headers = {"Authorization": "Bearer " + token.token}
@@ -115,7 +114,6 @@ def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: str):
         self.deployment = deployment
 
     async def describe_image(self, image_bytes: bytes) -> str:
-        logger.info("Describing image using LLM...")
         image_base64 = base64.b64encode(image_bytes).decode("utf-8")
         image_datauri = f"data:image/png;base64,{image_base64}"
 
@@ -131,10 +129,9 @@ async def describe_image(self, image_bytes: bytes) -> str:
                 "role": "user",
                 "content":
                 [{"text": "Describe image with no more than 5 sentences. Do not speculate about anything you don't know.", "type": "text"},
-                 {"image_url": {"url": image_datauri}, "type": "image_url", "detail": "low"}]
+                 {"image_url": {"url": image_datauri}, "type": "image_url", "detail": "auto"}]
             }
         ])
         description = response.choices[0].message.content.strip() if response.choices else ""
-        print(description)
         return description
 
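
Note (not part of the commit): a minimal sketch of calling the multimodal describer on one cropped figure. The constructor arguments match the __init__ shown above; the model and deployment values here are placeholders, and AsyncOpenAI() assumes the usual API-key environment configuration.

from openai import AsyncOpenAI
from prepdocslib.mediadescriber import MultimodalModelDescriber

async def describe_figure(png_bytes: bytes) -> str:
    describer = MultimodalModelDescriber(
        openai_client=AsyncOpenAI(),  # assumes OPENAI_API_KEY is set
        model="gpt-4o",               # placeholder model name
        deployment="gpt-4o",          # placeholder deployment name
    )
    # The describer base64-encodes the PNG into a data URI and sends it with detail="auto"
    return await describer.describe_image(png_bytes)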

app/backend/prepdocslib/page.py

Lines changed: 22 additions & 10 deletions
@@ -1,3 +1,17 @@
+from typing import Sequence
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ImageOnPage:
+    bytes: bytes
+    bbox: tuple[float, float, float, float]
+    filename: str
+    description: str
+    url: str | None = None
+    embedding: list[float] | None = None
+
+@dataclass
 class Page:
     """
     A single page from a document
@@ -7,13 +21,12 @@ class Page:
         offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow")
         text (str): The text of the page
     """
+    page_num: int
+    offset: int
+    text: str
+    images: list[ImageOnPage] = field(default_factory=list)
 
-    def __init__(self, page_num: int, offset: int, text: str):
-        self.page_num = page_num
-        self.offset = offset
-        self.text = text
-
-
+@dataclass
 class SplitPage:
     """
     A section of a page that has been split into a smaller chunk.
@@ -22,7 +35,6 @@ class SplitPage:
         page_num (int): Page number (0-indexed)
         text (str): The text of the section
     """
-
-    def __init__(self, page_num: int, text: str):
-        self.page_num = page_num
-        self.text = text
+    page_num: int
+    text: str
+    images: list[ImageOnPage] = field(default_factory=list)
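
Note (illustration only): constructing the new dataclasses; every value below is made up.

from prepdocslib.page import ImageOnPage, Page, SplitPage

figure = ImageOnPage(
    bytes=b"...PNG bytes...",
    bbox=(1.0, 2.0, 4.5, 6.0),  # placeholder bounding box for the figure on the page
    filename="page_1_figure_1.1.png",
    description="<figure><figcaption>Chart<br>Model-generated description</figcaption></figure>",
)
page = Page(page_num=0, offset=0, text="Page text including the figure description", images=[figure])
chunk = SplitPage(page_num=0, text="A smaller chunk of that page", images=page.images)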

app/backend/prepdocslib/pdfparser.py

Lines changed: 18 additions & 11 deletions
@@ -21,7 +21,7 @@
 from openai import AsyncOpenAI
 
 from .mediadescriber import MediaDescriber, ContentUnderstandingDescriber, MultimodalModelDescriber
-from .page import Page
+from .page import Page, ImageOnPage
 from .parser import Parser
 
 logger = logging.getLogger("scripts")
@@ -50,6 +50,8 @@ class MediaDescriptionStrategy(Enum):
     OPENAI = "openai"
     CONTENTUNDERSTANDING = "content_understanding"
 
+
+
 class DocumentAnalysisParser(Parser):
     """
     Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages
@@ -68,6 +70,7 @@ def __init__(
         openai_deployment: Optional[str] = None,
         # If using Content Understanding, this is the endpoint for the service
         content_understanding_endpoint: Union[str, None] = None,
+        # should this take the blob storage info too?
     ):
         self.model_id = model_id
         self.endpoint = endpoint
@@ -137,6 +140,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
             analyze_result: AnalyzeResult = await poller.result()
 
             offset = 0
+
             for page in analyze_result.pages:
                 tables_on_page = [
                     table
@@ -150,6 +154,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
                     for figure in (analyze_result.figures or [])
                     if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number
                 ]
+                page_images: list[ImageOnPage] = []
 
                 class ObjectType(Enum):
                     NONE = -1
@@ -195,24 +200,25 @@ class ObjectType(Enum):
                         if object_idx is None:
                             raise ValueError("Expected object_idx to be set")
                         if mask_char not in added_objects:
-                            figure_html = await DocumentAnalysisParser.figure_to_html(
+                            image_on_page = await DocumentAnalysisParser.process_figure(
                                 doc_for_pymupdf, figures_on_page[object_idx], media_describer
                             )
-                            page_text += figure_html
+                            page_images.append(image_on_page)
+                            page_text += image_on_page.description
                             added_objects.add(mask_char)
                 # We remove these comments since they are not needed and skew the page numbers
                 page_text = page_text.replace("<!-- PageBreak -->", "")
                 # We remove excess newlines at the beginning and end of the page
                 page_text = page_text.strip()
-                yield Page(page_num=page.page_number - 1, offset=offset, text=page_text)
+                yield Page(page_num=page.page_number - 1, offset=offset, text=page_text, images=page_images)
                 offset += len(page_text)
 
     @staticmethod
-    async def figure_to_html(
+    async def process_figure(
         doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber
     ) -> str:
         figure_title = (figure.caption and figure.caption.content) or ""
-        logger.info("Describing figure %s with title '%s'", figure.id, figure_title)
+        logger.info("Describing figure %s with title '%s' using %s", figure.id, figure_title, type(media_describer).__name__)
         if not figure.bounding_regions:
             return f"<figure><figcaption>{figure_title}</figcaption></figure>"
         if len(figure.bounding_regions) > 1:
@@ -228,7 +234,12 @@ async def figure_to_html(
         page_number = first_region["pageNumber"] # 1-indexed
         cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
         figure_description = await media_describer.describe_image(cropped_img)
-        return f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
+        return ImageOnPage(
+            bytes=cropped_img,
+            filename=f"page_{page_number}_figure_{figure.id}.png",
+            bbox=bounding_box,
+            description=f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
+        )
 
     @staticmethod
     def table_to_html(table: DocumentTable):
@@ -274,10 +285,6 @@ def crop_image_from_pdf_page(
         pix = page.get_pixmap(matrix=pymupdf.Matrix(page_dpi / bbox_dpi, page_dpi / bbox_dpi), clip=rect)
 
         img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
-        # print out the number of pixels
-        print(f"Cropped image size: {img.size} pixels")
         bytes_io = io.BytesIO()
         img.save(bytes_io, format="PNG")
-        with open(f"cropped_page_{page_number + 1}.png", "wb") as f:
-            f.write(bytes_io.getvalue())
         return bytes_io.getvalue()
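
Note (not part of the commit): a standalone sketch of the cropping technique used by crop_image_from_pdf_page above. The DPI values and the assumption that the bounding box is given in inches are placeholders, not taken from this diff.

import io

import pymupdf
from PIL import Image

def crop_figure(pdf_path: str, page_number: int, bbox_inches: tuple[float, float, float, float]) -> bytes:
    doc = pymupdf.open(pdf_path)
    page = doc.load_page(page_number)  # 0-indexed page
    bbox_dpi = 72    # assumed units of the bounding box (points per inch)
    page_dpi = 200   # assumed render resolution for the cropped figure
    rect = pymupdf.Rect(*(coord * bbox_dpi for coord in bbox_inches))
    pix = page.get_pixmap(matrix=pymupdf.Matrix(page_dpi / bbox_dpi, page_dpi / bbox_dpi), clip=rect)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()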

0 commit comments
