Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .trivyignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Vulnerability in build dependency, accept the risk
CVE-2025-14009 exp:2026-03-31
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ install_libreoffice:
sudo apt-get install --no-install-recommends -y libreoffice=$(LIBREOFFICE_UBUNTU_VERSION)

test: install_nox $(if $(CI), install_libreoffice)
$(POETRY) run nox -s test $(ARGS)
$(POETRY) run nox -- -s test -- $(ARGS)

docker_build:
$(DOCKER) build --platform $(PLATFORM) -t $(IMAGE_NAME):dev .
Expand Down
40 changes: 20 additions & 20 deletions aidial_rag/document_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ async def load_attachment(
)


def add_image_only_chunks(
async def add_image_only_chunks(
document_bytes: bytes,
mime_type: str,
existing_chunks: List[Document],
Expand All @@ -129,7 +129,7 @@ def add_image_only_chunks(
for i in range(len(existing_chunks) - 1)
)

number_of_pages = extract_number_of_pages(mime_type, document_bytes)
number_of_pages = await extract_number_of_pages(mime_type, document_bytes)
assert all(
1 <= existing_chunk.metadata["page_number"] <= number_of_pages
for existing_chunk in existing_chunks
Expand Down Expand Up @@ -179,7 +179,6 @@ def get_document_chunks(
document_bytes: bytes,
mime_type: str,
attachment_link: AttachmentLink,
attachment_mime_type: str,
parser_config: ParserConfig,
) -> List[Document]:
try:
Expand Down Expand Up @@ -210,24 +209,8 @@ def get_document_chunks(
except (PDFInfoNotInstalledError, TesseractNotFoundError):
# TODO: Update unstructured library to avoid attempts to use ocr
logging.warning("PDF file without text. Trying to extract images.")
chunks = None

if chunks is None:
chunks = []

if are_image_pages_supported(mime_type):
# We will not have chunks from unstructured for the pages which does not contain text
# So we need to add them manually
chunks = add_image_only_chunks(document_bytes, mime_type, chunks)

if not chunks:
raise InvalidDocumentError("The document is empty")

attachment_filetype = FileType.from_mime_type(attachment_mime_type)
if attachment_filetype == FileType.PDF:
chunks = add_pdf_source_metadata(chunks, attachment_link)
else:
chunks = add_source_metadata(chunks, attachment_link)
return chunks


Expand All @@ -249,10 +232,27 @@ async def parse_document(
document_bytes,
mime_type,
attachment_link,
attachment_mime_type,
parser_config,
)

if are_image_pages_supported(mime_type):
# Unstructured does not produce chunks for pages that do not contain text,
# so we need to add them manually
chunks = await add_image_only_chunks(
document_bytes, mime_type, chunks
)

if not chunks:
raise InvalidDocumentError("The document is empty")

# Use attachment_mime_type, not mime_type, because the source
# would point to the original attachment, not the converted document
attachment_filetype = FileType.from_mime_type(attachment_mime_type)
if attachment_filetype == FileType.PDF:
chunks = add_pdf_source_metadata(chunks, attachment_link)
else:
chunks = add_source_metadata(chunks, attachment_link)

# Unstructured does not set filetype for some document types
stageio.write(f"File type: {chunks[0].metadata.get('filetype')}\n")
print_documents_stats(stageio, chunks)
Expand Down
2 changes: 1 addition & 1 deletion aidial_rag/image_processor/document_image_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def is_mime_supported(self, mime: str):
return check_mime_type(mime, self.supported_mime_types)

@abstractmethod
def get_number_of_pages(self, file_bytes: bytes) -> int:
async def get_number_of_pages(self, file_bytes: bytes) -> int:
"""
Get number of pages for given document
Parameters:
Expand Down
4 changes: 2 additions & 2 deletions aidial_rag/image_processor/extract_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ async def extract_pages(
]


def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:
return get_extractor(mime_type).get_number_of_pages(document_bytes)
async def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:
    """Return the page count reported by the extractor chosen for mime_type.

    Parameters:
        mime_type: MIME type passed to get_extractor to select the extractor.
        document_bytes: Document content handed to the extractor.
    Returns:
        The awaited result of the extractor's get_number_of_pages.
    """
    extractor = get_extractor(mime_type)
    return await extractor.get_number_of_pages(document_bytes)


def are_image_pages_supported(mime: str) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion aidial_rag/image_processor/image_page_image_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
class ImagePageImageExtractor(DocumentPageImageExtractor):
supported_mime_types: List[str] = ["image/*"]

def get_number_of_pages(self, file_bytes: bytes) -> int:
async def get_number_of_pages(self, file_bytes: bytes) -> int:
return 1

async def extract_pages_gen(
Expand Down
104 changes: 70 additions & 34 deletions aidial_rag/image_processor/pdf_page_image_extractor.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import asyncio
import io
import logging
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing
from typing import AsyncGenerator, Iterable, List, Optional

import pdfplumber
from pdfplumber.page import Page
import pypdfium2 as pdfium
from PIL.Image import Image

from aidial_rag.image_processor.document_image_extractor import (
Expand All @@ -15,6 +14,50 @@
logger = logging.getLogger(__name__)


def _calculate_scale(
    width: float, height: float, scaled_size: Optional[int]
) -> float:
    """Calculate scale factor to scale the larger dimension to scaled_size.

    Parameters:
        width: Page width.
        height: Page height.
        scaled_size: Target size for the larger of the two dimensions.
            ``None`` or ``0`` means "do not scale".
    Returns:
        ``scaled_size / max(width, height)``, or ``1.0`` when no scaling is
        requested or the page has no positive dimension.
    """
    if not scaled_size:
        return 1.0

    larger_side = max(width, height)
    if larger_side <= 0:
        # Degenerate page size: dividing would raise ZeroDivisionError (or
        # give a negative scale), so fall back to the unscaled factor.
        return 1.0

    return scaled_size / larger_side


def _get_number_of_pages(file_bytes: bytes) -> int:
    """Open the PDF given as bytes and return its number of pages.

    The document is closed before the function returns.
    """
    # Not thread safe because of pypdfium2
    document = pdfium.PdfDocument(file_bytes)
    with closing(document):
        page_count = len(document)
    return page_count


def _render_page(
    file_bytes: bytes,
    page_number: int,
    scaled_size: Optional[int] = None,
) -> Image:
    """Render a single page of a PDF into an RGB PIL image.

    Parameters:
        file_bytes: Content of the PDF document.
        page_number: Ordinal (1-based) number of the page to render.
        scaled_size: Forwarded to _calculate_scale together with the page
            width and height to obtain the render scale.
    Returns:
        The rendered page bitmap converted to a PIL image in "RGB" mode.
    """
    # Not thread safe because of pypdfium2
    document = pdfium.PdfDocument(file_bytes)
    with closing(document):
        # pypdfium2 uses 0-based indexing
        page = document[page_number - 1]

        page_width = page.get_width()
        page_height = page.get_height()
        render_scale = _calculate_scale(page_width, page_height, scaled_size)

        rendered = page.render(
            # scale is a float, but its default value makes pyright think it is an int
            scale=render_scale,  # pyright: ignore [reportArgumentType]
            no_smoothtext=True,
            no_smoothpath=True,
            no_smoothimage=True,
            prefer_bgrx=True,
        )

        pil_image = rendered.to_pil()
        return pil_image.convert("RGB")


class PdfPageImageExtractor(DocumentPageImageExtractor):
supported_mime_types: List[str] = ["application/pdf"]

Expand All @@ -24,22 +67,11 @@ class PdfPageImageExtractor(DocumentPageImageExtractor):
thread_name_prefix="pdf_page_image_extractor",
)

def get_number_of_pages(self, file_bytes: bytes) -> int:
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
return len(pdf.pages)

def __get_page_image(
self, page: Page, scaled_size: Optional[int] = None
) -> Image:
width = None
height = None
if page.width > page.height:
width = scaled_size
else:
height = scaled_size

# __get_page_image is not thread safe, because to_image is not thread safe
return page.to_image(width=width, height=height).original
async def get_number_of_pages(self, file_bytes: bytes) -> int:
    """Return the number of pages of the PDF given as bytes.

    The count is computed by _get_number_of_pages, submitted to
    self._thread_pool through the running loop's run_in_executor.
    """
    return await asyncio.get_running_loop().run_in_executor(
        self._thread_pool, _get_number_of_pages, file_bytes
    )

async def extract_pages_gen(
self,
Expand All @@ -48,19 +80,23 @@ async def extract_pages_gen(
scaled_size: Optional[int] = None,
) -> AsyncGenerator[Image, None]:
loop = asyncio.get_running_loop()
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
total_pages = len(pdf.pages)
for page_number in page_numbers:
if not (1 <= page_number <= total_pages):
raise RuntimeError(
f"Invalid page number: {page_number}. Page number is ordinal number of the page. The document has {total_pages} pages."
)

logger.debug(f"Extracting page {page_number}...")
page = pdf.pages[page_number - 1]

image = await loop.run_in_executor(
self._thread_pool, self.__get_page_image, page, scaled_size

total_pages = await self.get_number_of_pages(file_bytes)
for page_number in page_numbers:
if not (1 <= page_number <= total_pages):
raise RuntimeError(
f"Invalid page number: {page_number}. Page number is ordinal number of the page. The document has {total_pages} pages."
)
logger.debug(f"Extracted page {page_number} as image")
yield image

logger.debug(f"Extracting page {page_number}...")

# Render in thread pool with a single thread, because pypdfium2 is not thread safe
image = await loop.run_in_executor(
self._thread_pool,
_render_page,
file_bytes,
page_number,
scaled_size,
)
logger.debug(f"Extracted page {page_number} as image")
yield image
4 changes: 3 additions & 1 deletion aidial_rag/retrievers/page_image_retriever_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ async def extract_page_images(
)
return None

number_of_pages = extract_number_of_pages(mime_type, original_document)
number_of_pages = await extract_number_of_pages(
mime_type, original_document
)

stageio.write("Extracting page images\n")
stageio.write(f"Number of pages: {number_of_pages}\n")
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

This file was deleted.

This file was deleted.

Large diffs are not rendered by default.

This file was deleted.

This file was deleted.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

This file was deleted.

This file was deleted.

This file was deleted.

Large diffs are not rendered by default.

This file was deleted.

Binary file added tests/data/test_pdf_with_image_broken_xref.pdf
Binary file not shown.
Binary file added tests/data/test_presentation_converted.pdf
Binary file not shown.
8 changes: 6 additions & 2 deletions tests/test_extract_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
)


def test_number_of_pages():
@pytest.mark.asyncio
async def test_number_of_pages():
with open("tests/data/test_pdf_with_image_and_text.pdf", "rb") as pdf_bytes:
num_pages = extract_number_of_pages("application/pdf", pdf_bytes.read())
num_pages = await extract_number_of_pages(
"application/pdf",
pdf_bytes.read(),
)

assert num_pages == 1

Expand Down
27 changes: 26 additions & 1 deletion tests/test_image_extractor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import pytest

from aidial_rag.image_processor.extract_pages import extract_pages
from aidial_rag.image_processor.extract_pages import (
extract_number_of_pages,
extract_pages,
)


@pytest.mark.asyncio
Expand Down Expand Up @@ -65,3 +68,25 @@ async def test_attachment_image_invalid_page2():
file_bytes=pdf_bytes.read(),
page_numbers=[1, 1],
)


@pytest.mark.asyncio
async def test_presentation_converted():
    """Check the page count and scaled image sizes of the converted presentation PDF."""
    with open("tests/data/test_presentation_converted.pdf", "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    number_of_pages = await extract_number_of_pages("application/pdf", pdf_bytes)
    assert number_of_pages == 2

    images = await extract_pages(
        mime_type="application/pdf",
        file_bytes=pdf_bytes,
        page_numbers=[1, 2],
        scaled_size=800,
    )

    assert len(images) == 2
    # Both pages are expected to render at 800x450 when scaled to 800
    for image in images:
        assert image.height == 450
        assert image.width == 800
13 changes: 13 additions & 0 deletions tests/test_load_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,19 @@ async def test_load_pdf_with_image_and_no_text(local_server):
assert chunks[0].metadata["page_number"] == 1


@pytest.mark.asyncio
async def test_load_pdf_with_broken_xref(local_server):
    """Test loading a PDF with a broken xref table.

    Some PDF libraries are unable to load this PDF: they cannot find the
    pages catalog through the broken xref table and return an empty
    document. The PDF can still be loaded successfully by scanning the file
    sequentially.
    """
    chunks = await load_document("test_pdf_with_image_broken_xref.pdf")
    # Should successfully load at least 1 chunk (1 page with image)
    assert len(chunks) >= 1
    first_metadata = chunks[0].metadata
    assert first_metadata.get("page_number") == 1
    assert first_metadata["filetype"] == "application/pdf"


@pytest.mark.asyncio
async def test_load_single_line_text(local_server):
chunks = await load_document("hello.txt")
Expand Down