Skip to content

Commit 486d0f3

Browse files
authored
fix: add support of scanned pdf with broken xref (#118)
1 parent 530f5a0 commit 486d0f3

File tree

37 files changed

+1687
-1437
lines changed

37 files changed

+1687
-1437
lines changed

.trivyignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Vulnerability in build dependency, accept the risk
2+
CVE-2025-14009 exp:2026-03-31

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ install_libreoffice:
4646
sudo apt-get install --no-install-recommends -y libreoffice=$(LIBREOFFICE_UBUNTU_VERSION)
4747

4848
test: install_nox $(if $(CI), install_libreoffice)
49-
$(POETRY) run nox -s test $(ARGS)
49+
$(POETRY) run nox -- -s test -- $(ARGS)
5050

5151
docker_build:
5252
$(DOCKER) build --platform $(PLATFORM) -t $(IMAGE_NAME):dev .

aidial_rag/document_loaders.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ async def load_attachment(
118118
)
119119

120120

121-
def add_image_only_chunks(
121+
async def add_image_only_chunks(
122122
document_bytes: bytes,
123123
mime_type: str,
124124
existing_chunks: List[Document],
@@ -129,7 +129,7 @@ def add_image_only_chunks(
129129
for i in range(len(existing_chunks) - 1)
130130
)
131131

132-
number_of_pages = extract_number_of_pages(mime_type, document_bytes)
132+
number_of_pages = await extract_number_of_pages(mime_type, document_bytes)
133133
assert all(
134134
1 <= existing_chunk.metadata["page_number"] <= number_of_pages
135135
for existing_chunk in existing_chunks
@@ -179,7 +179,6 @@ def get_document_chunks(
179179
document_bytes: bytes,
180180
mime_type: str,
181181
attachment_link: AttachmentLink,
182-
attachment_mime_type: str,
183182
parser_config: ParserConfig,
184183
) -> List[Document]:
185184
try:
@@ -210,24 +209,8 @@ def get_document_chunks(
210209
except (PDFInfoNotInstalledError, TesseractNotFoundError):
211210
# TODO: Update unstructured library to avoid attempts to use ocr
212211
logging.warning("PDF file without text. Trying to extract images.")
213-
chunks = None
214-
215-
if chunks is None:
216212
chunks = []
217213

218-
if are_image_pages_supported(mime_type):
219-
# We will not have chunks from unstructured for the pages which do not contain text
220-
# So we need to add them manually
221-
chunks = add_image_only_chunks(document_bytes, mime_type, chunks)
222-
223-
if not chunks:
224-
raise InvalidDocumentError("The document is empty")
225-
226-
attachment_filetype = FileType.from_mime_type(attachment_mime_type)
227-
if attachment_filetype == FileType.PDF:
228-
chunks = add_pdf_source_metadata(chunks, attachment_link)
229-
else:
230-
chunks = add_source_metadata(chunks, attachment_link)
231214
return chunks
232215

233216

@@ -249,10 +232,27 @@ async def parse_document(
249232
document_bytes,
250233
mime_type,
251234
attachment_link,
252-
attachment_mime_type,
253235
parser_config,
254236
)
255237

238+
if are_image_pages_supported(mime_type):
239+
# We will not have chunks from unstructured for the pages which do not contain text
240+
# So we need to add them manually
241+
chunks = await add_image_only_chunks(
242+
document_bytes, mime_type, chunks
243+
)
244+
245+
if not chunks:
246+
raise InvalidDocumentError("The document is empty")
247+
248+
# Use attachment_mime_type, not mime_type, because the source
249+
# would point to the original attachment, not the converted document
250+
attachment_filetype = FileType.from_mime_type(attachment_mime_type)
251+
if attachment_filetype == FileType.PDF:
252+
chunks = add_pdf_source_metadata(chunks, attachment_link)
253+
else:
254+
chunks = add_source_metadata(chunks, attachment_link)
255+
256256
# Unstructured does not set filetype for some document types
257257
stageio.write(f"File type: {chunks[0].metadata.get('filetype')}\n")
258258
print_documents_stats(stageio, chunks)

aidial_rag/image_processor/document_image_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def is_mime_supported(self, mime: str):
1616
return check_mime_type(mime, self.supported_mime_types)
1717

1818
@abstractmethod
19-
def get_number_of_pages(self, file_bytes: bytes) -> int:
19+
async def get_number_of_pages(self, file_bytes: bytes) -> int:
2020
"""
2121
Get number of pages for given document
2222
Parameters:

aidial_rag/image_processor/extract_pages.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ async def extract_pages(
6161
]
6262

6363

64-
def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:
65-
return get_extractor(mime_type).get_number_of_pages(document_bytes)
64+
async def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:
65+
return await get_extractor(mime_type).get_number_of_pages(document_bytes)
6666

6767

6868
def are_image_pages_supported(mime: str) -> bool:

aidial_rag/image_processor/image_page_image_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
class ImagePageImageExtractor(DocumentPageImageExtractor):
1414
supported_mime_types: List[str] = ["image/*"]
1515

16-
def get_number_of_pages(self, file_bytes: bytes) -> int:
16+
async def get_number_of_pages(self, file_bytes: bytes) -> int:
1717
return 1
1818

1919
async def extract_pages_gen(
Lines changed: 70 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
import asyncio
2-
import io
32
import logging
43
from concurrent.futures import ThreadPoolExecutor
4+
from contextlib import closing
55
from typing import AsyncGenerator, Iterable, List, Optional
66

7-
import pdfplumber
8-
from pdfplumber.page import Page
7+
import pypdfium2 as pdfium
98
from PIL.Image import Image
109

1110
from aidial_rag.image_processor.document_image_extractor import (
@@ -15,6 +14,50 @@
1514
logger = logging.getLogger(__name__)
1615

1716

17+
def _calculate_scale(
18+
width: float, height: float, scaled_size: Optional[int]
19+
) -> float:
20+
"""Calculate scale factor to scale the larger dimension to scaled_size."""
21+
if not scaled_size:
22+
return 1.0
23+
24+
if width > height:
25+
return scaled_size / width
26+
else:
27+
return scaled_size / height
28+
29+
30+
def _get_number_of_pages(file_bytes: bytes) -> int:
31+
# Not thread safe because of pypdfium2
32+
with closing(pdfium.PdfDocument(file_bytes)) as pdf:
33+
return len(pdf)
34+
35+
36+
def _render_page(
37+
file_bytes: bytes,
38+
page_number: int,
39+
scaled_size: Optional[int] = None,
40+
) -> Image:
41+
# Not thread safe because of pypdfium2
42+
with closing(pdfium.PdfDocument(file_bytes)) as pdf:
43+
page = pdf[page_number - 1] # pypdfium2 uses 0-based indexing
44+
45+
scale = _calculate_scale(
46+
page.get_width(), page.get_height(), scaled_size
47+
)
48+
49+
bitmap = page.render(
50+
# scale is a float, but the default value makes pyright think it's an int
51+
scale=scale, # pyright: ignore [reportArgumentType]
52+
no_smoothtext=True,
53+
no_smoothpath=True,
54+
no_smoothimage=True,
55+
prefer_bgrx=True,
56+
)
57+
58+
return bitmap.to_pil().convert("RGB")
59+
60+
1861
class PdfPageImageExtractor(DocumentPageImageExtractor):
1962
supported_mime_types: List[str] = ["application/pdf"]
2063

@@ -24,22 +67,11 @@ class PdfPageImageExtractor(DocumentPageImageExtractor):
2467
thread_name_prefix="pdf_page_image_extractor",
2568
)
2669

27-
def get_number_of_pages(self, file_bytes: bytes) -> int:
28-
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
29-
return len(pdf.pages)
30-
31-
def __get_page_image(
32-
self, page: Page, scaled_size: Optional[int] = None
33-
) -> Image:
34-
width = None
35-
height = None
36-
if page.width > page.height:
37-
width = scaled_size
38-
else:
39-
height = scaled_size
40-
41-
# __get_page_image is not thread safe, because to_image is not thread safe
42-
return page.to_image(width=width, height=height).original
70+
async def get_number_of_pages(self, file_bytes: bytes) -> int:
71+
loop = asyncio.get_running_loop()
72+
return await loop.run_in_executor(
73+
self._thread_pool, _get_number_of_pages, file_bytes
74+
)
4375

4476
async def extract_pages_gen(
4577
self,
@@ -48,19 +80,23 @@ async def extract_pages_gen(
4880
scaled_size: Optional[int] = None,
4981
) -> AsyncGenerator[Image, None]:
5082
loop = asyncio.get_running_loop()
51-
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
52-
total_pages = len(pdf.pages)
53-
for page_number in page_numbers:
54-
if not (1 <= page_number <= total_pages):
55-
raise RuntimeError(
56-
f"Invalid page number: {page_number}. Page number is ordinal number of the page. The document has {total_pages} pages."
57-
)
58-
59-
logger.debug(f"Extracting page {page_number}...")
60-
page = pdf.pages[page_number - 1]
61-
62-
image = await loop.run_in_executor(
63-
self._thread_pool, self.__get_page_image, page, scaled_size
83+
84+
total_pages = await self.get_number_of_pages(file_bytes)
85+
for page_number in page_numbers:
86+
if not (1 <= page_number <= total_pages):
87+
raise RuntimeError(
88+
f"Invalid page number: {page_number}. Page number is ordinal number of the page. The document has {total_pages} pages."
6489
)
65-
logger.debug(f"Extracted page {page_number} as image")
66-
yield image
90+
91+
logger.debug(f"Extracting page {page_number}...")
92+
93+
# Render in thread pool with a single thread, because pypdfium2 is not thread safe
94+
image = await loop.run_in_executor(
95+
self._thread_pool,
96+
_render_page,
97+
file_bytes,
98+
page_number,
99+
scaled_size,
100+
)
101+
logger.debug(f"Extracted page {page_number} as image")
102+
yield image

aidial_rag/retrievers/page_image_retriever_utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ async def extract_page_images(
2727
)
2828
return None
2929

30-
number_of_pages = extract_number_of_pages(mime_type, original_document)
30+
number_of_pages = await extract_number_of_pages(
31+
mime_type, original_document
32+
)
3133

3234
stageio.write("Extracting page images\n")
3335
stageio.write(f"Number of pages: {number_of_pages}\n")

tests/cache/test_app/test_presentation_odp/0a38fb6a0fa55a881e3d29a711a0d1a7.response

Lines changed: 153 additions & 0 deletions
Large diffs are not rendered by default.

tests/cache/test_app/test_presentation_odp/105d5580b43b8b222bd74ce44b667769.response

Lines changed: 104 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)