epam · Allob · Feb 27, 2026 · Feb 24, 2026 · Feb 26, 2026 · Feb 27, 2026
@@ -0,0 +1,2 @@
+# Vulnerability in build dependency, accept the risk
+CVE-2025-14009 exp:2026-03-31
@@ -46,7 +46,7 @@ install_libreoffice:
 	sudo apt-get install --no-install-recommends -y libreoffice=$(LIBREOFFICE_UBUNTU_VERSION)
 
 test: install_nox $(if $(CI), install_libreoffice)
-	$(POETRY) run nox -s test $(ARGS)
+	$(POETRY) run nox -- -s test -- $(ARGS)
 
 docker_build:
 	$(DOCKER) build --platform $(PLATFORM) -t $(IMAGE_NAME):dev .

@@ -118,7 +118,7 @@ async def load_attachment(
     )
 
 
-def add_image_only_chunks(
+async def add_image_only_chunks(
     document_bytes: bytes,
     mime_type: str,
     existing_chunks: List[Document],
@@ -129,7 +129,7 @@ def add_image_only_chunks(
         for i in range(len(existing_chunks) - 1)
     )
 
-    number_of_pages = extract_number_of_pages(mime_type, document_bytes)
+    number_of_pages = await extract_number_of_pages(mime_type, document_bytes)
     assert all(
         1 <= existing_chunk.metadata["page_number"] <= number_of_pages
         for existing_chunk in existing_chunks
@@ -179,7 +179,6 @@ def get_document_chunks(
     document_bytes: bytes,
     mime_type: str,
     attachment_link: AttachmentLink,
-    attachment_mime_type: str,
     parser_config: ParserConfig,
 ) -> List[Document]:
     try:
@@ -210,24 +209,8 @@ def get_document_chunks(
     except (PDFInfoNotInstalledError, TesseractNotFoundError):
         # TODO: Update unstructured library to avoid attempts to use ocr
         logging.warning("PDF file without text. Trying to extract images.")
-        chunks = None
-
-    if chunks is None:
         chunks = []
 
-    if are_image_pages_supported(mime_type):
-        # We will not have chunks from unstructured for the pages which does not contain text
-        # So we need to add them manually
-        chunks = add_image_only_chunks(document_bytes, mime_type, chunks)
-
-    if not chunks:
-        raise InvalidDocumentError("The document is empty")
-
-    attachment_filetype = FileType.from_mime_type(attachment_mime_type)
-    if attachment_filetype == FileType.PDF:
-        chunks = add_pdf_source_metadata(chunks, attachment_link)
-    else:
-        chunks = add_source_metadata(chunks, attachment_link)
     return chunks
 
 
@@ -249,10 +232,27 @@ async def parse_document(
             document_bytes,
             mime_type,
             attachment_link,
-            attachment_mime_type,
             parser_config,
         )
 
+        if are_image_pages_supported(mime_type):
+            # Unstructured produces no chunks for pages that do not contain text,
+            # so we need to add them manually
+            chunks = await add_image_only_chunks(
+                document_bytes, mime_type, chunks
+            )
+
+        if not chunks:
+            raise InvalidDocumentError("The document is empty")
+
+        # Use attachment_mime_type, not mime_type, because the source metadata
+        # should point to the original attachment, not the converted document
+        attachment_filetype = FileType.from_mime_type(attachment_mime_type)
+        if attachment_filetype == FileType.PDF:
+            chunks = add_pdf_source_metadata(chunks, attachment_link)
+        else:
+            chunks = add_source_metadata(chunks, attachment_link)
+
         # Unstructured does not set filetype for some document types
         stageio.write(f"File type: {chunks[0].metadata.get('filetype')}\n")
         print_documents_stats(stageio, chunks)

@@ -16,7 +16,7 @@ def is_mime_supported(self, mime: str):
         return check_mime_type(mime, self.supported_mime_types)
 
     @abstractmethod
-    def get_number_of_pages(self, file_bytes: bytes) -> int:
+    async def get_number_of_pages(self, file_bytes: bytes) -> int:
         """
         Get number of pages for given document
         Parameters:

@@ -61,8 +61,8 @@ async def extract_pages(
     ]
 
 
-def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:
-    return get_extractor(mime_type).get_number_of_pages(document_bytes)
+async def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:
+    return await get_extractor(mime_type).get_number_of_pages(document_bytes)
 
 
 def are_image_pages_supported(mime: str) -> bool:

@@ -13,7 +13,7 @@
 class ImagePageImageExtractor(DocumentPageImageExtractor):
     supported_mime_types: List[str] = ["image/*"]
 
-    def get_number_of_pages(self, file_bytes: bytes) -> int:
+    async def get_number_of_pages(self, file_bytes: bytes) -> int:
         return 1
 
     async def extract_pages_gen(

@@ -1,11 +1,10 @@
 import asyncio
-import io
 import logging
 from concurrent.futures import ThreadPoolExecutor
+from contextlib import closing
 from typing import AsyncGenerator, Iterable, List, Optional
 
-import pdfplumber
-from pdfplumber.page import Page
+import pypdfium2 as pdfium
 from PIL.Image import Image
 
 from aidial_rag.image_processor.document_image_extractor import (
@@ -15,6 +14,50 @@
 logger = logging.getLogger(__name__)
 
 
+def _calculate_scale(
+    width: float, height: float, scaled_size: Optional[int]
+) -> float:
+    """Calculate scale factor to scale the larger dimension to scaled_size."""
+    if not scaled_size:
+        return 1.0
+
+    if width > height:
+        return scaled_size / width
+    else:
+        return scaled_size / height
+
+
+def _get_number_of_pages(file_bytes: bytes) -> int:
+    # Not thread safe because of pypdfium2
+    with closing(pdfium.PdfDocument(file_bytes)) as pdf:
+        return len(pdf)
+
+
+def _render_page(
+    file_bytes: bytes,
+    page_number: int,
+    scaled_size: Optional[int] = None,
+) -> Image:
+    # Not thread safe because of pypdfium2
+    with closing(pdfium.PdfDocument(file_bytes)) as pdf:
+        page = pdf[page_number - 1]  # pypdfium2 uses 0-based indexing
+
+        scale = _calculate_scale(
+            page.get_width(), page.get_height(), scaled_size
+        )
+
+        bitmap = page.render(
+            # scale is a float, but its default value makes pyright think it's an int
+            scale=scale,  # pyright: ignore [reportArgumentType]
+            no_smoothtext=True,
+            no_smoothpath=True,
+            no_smoothimage=True,
+            prefer_bgrx=True,
+        )
+
+        return bitmap.to_pil().convert("RGB")
+
+
 class PdfPageImageExtractor(DocumentPageImageExtractor):
     supported_mime_types: List[str] = ["application/pdf"]
 
@@ -24,22 +67,11 @@ class PdfPageImageExtractor(DocumentPageImageExtractor):
         thread_name_prefix="pdf_page_image_extractor",
     )
 
-    def get_number_of_pages(self, file_bytes: bytes) -> int:
-        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-            return len(pdf.pages)
-
-    def __get_page_image(
-        self, page: Page, scaled_size: Optional[int] = None
-    ) -> Image:
-        width = None
-        height = None
-        if page.width > page.height:
-            width = scaled_size
-        else:
-            height = scaled_size
-
-        # __get_page_image is not thread safe, because to_image is not thread safe
-        return page.to_image(width=width, height=height).original
+    async def get_number_of_pages(self, file_bytes: bytes) -> int:
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(
+            self._thread_pool, _get_number_of_pages, file_bytes
+        )
 
     async def extract_pages_gen(
         self,
@@ -48,19 +80,23 @@ async def extract_pages_gen(
         scaled_size: Optional[int] = None,
     ) -> AsyncGenerator[Image, None]:
         loop = asyncio.get_running_loop()
-        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-            total_pages = len(pdf.pages)
-            for page_number in page_numbers:
-                if not (1 <= page_number <= total_pages):
-                    raise RuntimeError(
-                        f"Invalid page number: {page_number}. Page number is ordinal number of the page. The document has {total_pages} pages."
-                    )
-
-                logger.debug(f"Extracting page {page_number}...")
-                page = pdf.pages[page_number - 1]
-
-                image = await loop.run_in_executor(
-                    self._thread_pool, self.__get_page_image, page, scaled_size
+
+        total_pages = await self.get_number_of_pages(file_bytes)
+        for page_number in page_numbers:
+            if not (1 <= page_number <= total_pages):
+                raise RuntimeError(
+                    f"Invalid page number: {page_number}. Page number is ordinal number of the page. The document has {total_pages} pages."
                 )
-                logger.debug(f"Extracted page {page_number} as image")
-                yield image
+
+            logger.debug(f"Extracting page {page_number}...")
+
+            # Render in a single-threaded thread pool, because pypdfium2 is not thread-safe
+            image = await loop.run_in_executor(
+                self._thread_pool,
+                _render_page,
+                file_bytes,
+                page_number,
+                scaled_size,
+            )
+            logger.debug(f"Extracted page {page_number} as image")
+            yield image
@@ -27,7 +27,9 @@ async def extract_page_images(
         )
         return None
 
-    number_of_pages = extract_number_of_pages(mime_type, original_document)
+    number_of_pages = await extract_number_of_pages(
+        mime_type, original_document
+    )
 
     stageio.write("Extracting page images\n")
     stageio.write(f"Number of pages: {number_of_pages}\n")

@@ -7,9 +7,13 @@
 )
 
 
-def test_number_of_pages():
+@pytest.mark.asyncio
+async def test_number_of_pages():
     with open("tests/data/test_pdf_with_image_and_text.pdf", "rb") as pdf_bytes:
-        num_pages = extract_number_of_pages("application/pdf", pdf_bytes.read())
+        num_pages = await extract_number_of_pages(
+            "application/pdf",
+            pdf_bytes.read(),
+        )
 
     assert num_pages == 1
 

@@ -1,6 +1,9 @@
 import pytest
 
-from aidial_rag.image_processor.extract_pages import extract_pages
+from aidial_rag.image_processor.extract_pages import (
+    extract_number_of_pages,
+    extract_pages,
+)
 
 
 @pytest.mark.asyncio
@@ -65,3 +68,25 @@ async def test_attachment_image_invalid_page2():
                 file_bytes=pdf_bytes.read(),
                 page_numbers=[1, 1],
             )
+
+
+@pytest.mark.asyncio
+async def test_presentation_converted():
+    with open("tests/data/test_presentation_converted.pdf", "rb") as pdf_file:
+        pdf_bytes = pdf_file.read()
+
+    page_numbers = await extract_number_of_pages("application/pdf", pdf_bytes)
+    assert page_numbers == 2
+
+    images = await extract_pages(
+        mime_type="application/pdf",
+        file_bytes=pdf_bytes,
+        page_numbers=[1, 2],
+        scaled_size=800,
+    )
+
+    assert len(images) == 2
+    assert images[0].height == 450
+    assert images[0].width == 800
+    assert images[1].height == 450
+    assert images[1].width == 800
@@ -48,6 +48,19 @@ async def test_load_pdf_with_image_and_no_text(local_server):
     assert chunks[0].metadata["page_number"] == 1
 
 
+@pytest.mark.asyncio
+async def test_load_pdf_with_broken_xref(local_server):
+    """Test loading a PDF with a broken xref table.
+
+    Some PDF libraries are unable to load this PDF: they cannot find the pages catalog through the broken xref table and return an empty document. However, the PDF can still be loaded successfully by scanning the file sequentially when the xref table is broken.
+    """
+    chunks = await load_document("test_pdf_with_image_broken_xref.pdf")
+    # Should successfully load at least 1 chunk (1 page with image)
+    assert len(chunks) >= 1
+    assert chunks[0].metadata.get("page_number") == 1
+    assert chunks[0].metadata["filetype"] == "application/pdf"
+
+
 @pytest.mark.asyncio
 async def test_load_single_line_text(local_server):
     chunks = await load_document("hello.txt")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Vulnerability in build dependency, accept the risk
		CVE-2025-14009 exp:2026-03-31