epam
diff --git a/‎.trivyignore‎
Lines changed: 2 additions & 0 deletions b/‎.trivyignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 1 addition & 1 deletion b/‎Makefile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎aidial_rag/document_loaders.py‎
Lines changed: 20 additions & 20 deletions b/‎aidial_rag/document_loaders.py‎
Lines changed: 20 additions & 20 deletions
diff --git a/‎aidial_rag/image_processor/document_image_extractor.py‎
Lines changed: 1 addition & 1 deletion b/‎aidial_rag/image_processor/document_image_extractor.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎aidial_rag/image_processor/extract_pages.py‎
Lines changed: 2 additions & 2 deletions b/‎aidial_rag/image_processor/extract_pages.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎aidial_rag/image_processor/image_page_image_extractor.py‎
Lines changed: 1 addition & 1 deletion b/‎aidial_rag/image_processor/image_page_image_extractor.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎aidial_rag/image_processor/pdf_page_image_extractor.py‎
Lines changed: 70 additions & 34 deletions b/‎aidial_rag/image_processor/pdf_page_image_extractor.py‎
Lines changed: 70 additions & 34 deletions
diff --git a/‎aidial_rag/retrievers/page_image_retriever_utils.py‎
Lines changed: 3 additions & 1 deletion b/‎aidial_rag/retrievers/page_image_retriever_utils.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎tests/cache/test_app/test_presentation_odp/0a38fb6a0fa55a881e3d29a711a0d1a7.response‎
Lines changed: 153 additions & 0 deletions b/‎tests/cache/test_app/test_presentation_odp/0a38fb6a0fa55a881e3d29a711a0d1a7.response‎
Lines changed: 153 additions & 0 deletions
diff --git a/‎tests/cache/test_app/test_presentation_odp/105d5580b43b8b222bd74ce44b667769.response‎
Lines changed: 104 additions & 0 deletions b/‎tests/cache/test_app/test_presentation_odp/105d5580b43b8b222bd74ce44b667769.response‎
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,2 @@
+# Vulnerability in build dependency, accept the risk
+CVE-2025-14009 exp:2026-03-31
@@ -46,7 +46,7 @@ install_libreoffice:
 	sudo apt-get install --no-install-recommends -y libreoffice=$(LIBREOFFICE_UBUNTU_VERSION)
 
 test: install_nox $(if $(CI), install_libreoffice)
-	$(POETRY) run nox -s test $(ARGS)
+	$(POETRY) run nox -- -s test -- $(ARGS)
 
 docker_build:
 	$(DOCKER) build --platform $(PLATFORM) -t $(IMAGE_NAME):dev .
 
@@ -118,7 +118,7 @@ async def load_attachment(
     )
 
 
-def add_image_only_chunks(
+async def add_image_only_chunks(
     document_bytes: bytes,
     mime_type: str,
     existing_chunks: List[Document],
@@ -129,7 +129,7 @@ def add_image_only_chunks(
         for i in range(len(existing_chunks) - 1)
     )
 
-    number_of_pages = extract_number_of_pages(mime_type, document_bytes)
+    number_of_pages = await extract_number_of_pages(mime_type, document_bytes)
     assert all(
         1 <= existing_chunk.metadata["page_number"] <= number_of_pages
         for existing_chunk in existing_chunks
@@ -179,7 +179,6 @@ def get_document_chunks(
     document_bytes: bytes,
     mime_type: str,
     attachment_link: AttachmentLink,
-    attachment_mime_type: str,
     parser_config: ParserConfig,
 ) -> List[Document]:
     try:
@@ -210,24 +209,8 @@ def get_document_chunks(
     except (PDFInfoNotInstalledError, TesseractNotFoundError):
         # TODO: Update unstructured library to avoid attempts to use ocr
         logging.warning("PDF file without text. Trying to extract images.")
-        chunks = None
-
-    if chunks is None:
         chunks = []
 
-    if are_image_pages_supported(mime_type):
-        # We will not have chunks from unstructured for the pages which does not contain text
-        # So we need to add them manually
-        chunks = add_image_only_chunks(document_bytes, mime_type, chunks)
-
-    if not chunks:
-        raise InvalidDocumentError("The document is empty")
-
-    attachment_filetype = FileType.from_mime_type(attachment_mime_type)
-    if attachment_filetype == FileType.PDF:
-        chunks = add_pdf_source_metadata(chunks, attachment_link)
-    else:
-        chunks = add_source_metadata(chunks, attachment_link)
     return chunks
 
 
@@ -249,10 +232,27 @@ async def parse_document(
             document_bytes,
             mime_type,
             attachment_link,
-            attachment_mime_type,
             parser_config,
         )
 
+        if are_image_pages_supported(mime_type):
+            # Unstructured produces no chunks for pages that do not contain text,
+            # so we need to add them manually
+            chunks = await add_image_only_chunks(
+                document_bytes, mime_type, chunks
+            )
+
+        if not chunks:
+            raise InvalidDocumentError("The document is empty")
+
+        # Use attachment_mime_type, not mime_type, because the source
+        # would point to the original attachment, not the converted document
+        attachment_filetype = FileType.from_mime_type(attachment_mime_type)
+        if attachment_filetype == FileType.PDF:
+            chunks = add_pdf_source_metadata(chunks, attachment_link)
+        else:
+            chunks = add_source_metadata(chunks, attachment_link)
+
         # Unstructured does not set filetype for some document types
         stageio.write(f"File type: {chunks[0].metadata.get('filetype')}\n")
         print_documents_stats(stageio, chunks)
 
@@ -16,7 +16,7 @@ def is_mime_supported(self, mime: str):
         return check_mime_type(mime, self.supported_mime_types)
 
     @abstractmethod
-    def get_number_of_pages(self, file_bytes: bytes) -> int:
+    async def get_number_of_pages(self, file_bytes: bytes) -> int:
         """
         Get number of pages for given document
         Parameters:
 
@@ -61,8 +61,8 @@ async def extract_pages(
     ]
 
 
-def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:
-    return get_extractor(mime_type).get_number_of_pages(document_bytes)
+async def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:
+    return await get_extractor(mime_type).get_number_of_pages(document_bytes)
 
 
 def are_image_pages_supported(mime: str) -> bool:
 
@@ -13,7 +13,7 @@
 class ImagePageImageExtractor(DocumentPageImageExtractor):
     supported_mime_types: List[str] = ["image/*"]
 
-    def get_number_of_pages(self, file_bytes: bytes) -> int:
+    async def get_number_of_pages(self, file_bytes: bytes) -> int:
         return 1
 
     async def extract_pages_gen(
 
@@ -1,11 +1,10 @@
 import asyncio
-import io
 import logging
 from concurrent.futures import ThreadPoolExecutor
+from contextlib import closing
 from typing import AsyncGenerator, Iterable, List, Optional
 
-import pdfplumber
-from pdfplumber.page import Page
+import pypdfium2 as pdfium
 from PIL.Image import Image
 
 from aidial_rag.image_processor.document_image_extractor import (
@@ -15,6 +14,50 @@
 logger = logging.getLogger(__name__)
 
 
+def _calculate_scale(
+    width: float, height: float, scaled_size: Optional[int]
+) -> float:
+    """Calculate scale factor to scale the larger dimension to scaled_size."""
+    # A falsy scaled_size (None or 0) means no rescaling: keep the scale at 1.0.
+    if not scaled_size:
+        return 1.0
+
+    # Divide by the larger of the two dimensions. When width == height the
+    # else branch is taken, which yields the same value.
+    if width > height:
+        return scaled_size / width
+    else:
+        return scaled_size / height
+
+
+def _get_number_of_pages(file_bytes: bytes) -> int:
+    # Returns the number of pages of the PDF document given as raw bytes.
+    # Not thread safe because of pypdfium2
+    # closing() calls pdf.close() when the with-block exits, also on error.
+    with closing(pdfium.PdfDocument(file_bytes)) as pdf:
+        return len(pdf)
+
+
+def _render_page(
+    file_bytes: bytes,
+    page_number: int,
+    scaled_size: Optional[int] = None,
+) -> Image:
+    # Renders a single page of the PDF given as raw bytes and returns it as
+    # a PIL image converted to "RGB" mode.
+    # page_number is 1-based. scaled_size is passed to _calculate_scale to
+    # derive the render scale from the page's width and height.
+    # The document is opened from file_bytes and closed again on every call.
+    # Not thread safe because of pypdfium2
+    with closing(pdfium.PdfDocument(file_bytes)) as pdf:
+        page = pdf[page_number - 1]  # pypdfium2 uses 0-based indexing
+
+        scale = _calculate_scale(
+            page.get_width(), page.get_height(), scaled_size
+        )
+
+        # NOTE(review): the no_smooth* flags presumably disable anti-aliasing
+        # and prefer_bgrx presumably selects a 4-channel bitmap format;
+        # confirm against the pypdfium2 render documentation.
+        bitmap = page.render(
+            # scale is a float, but the default value makes pyright think it's an int
+            scale=scale,  # pyright: ignore [reportArgumentType]
+            no_smoothtext=True,
+            no_smoothpath=True,
+            no_smoothimage=True,
+            prefer_bgrx=True,
+        )
+
+        # Convert the rendered bitmap to a PIL image and normalize it to RGB.
+        return bitmap.to_pil().convert("RGB")
+
+
 class PdfPageImageExtractor(DocumentPageImageExtractor):
     supported_mime_types: List[str] = ["application/pdf"]
 
@@ -24,22 +67,11 @@ class PdfPageImageExtractor(DocumentPageImageExtractor):
         thread_name_prefix="pdf_page_image_extractor",
     )
 
-    def get_number_of_pages(self, file_bytes: bytes) -> int:
-        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-            return len(pdf.pages)
-
-    def __get_page_image(
-        self, page: Page, scaled_size: Optional[int] = None
-    ) -> Image:
-        width = None
-        height = None
-        if page.width > page.height:
-            width = scaled_size
-        else:
-            height = scaled_size
-
-        # __get_page_image is not thread safe, because to_image is not thread safe
-        return page.to_image(width=width, height=height).original
+    async def get_number_of_pages(self, file_bytes: bytes) -> int:
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(
+            self._thread_pool, _get_number_of_pages, file_bytes
+        )
 
     async def extract_pages_gen(
         self,
@@ -48,19 +80,23 @@ async def extract_pages_gen(
         scaled_size: Optional[int] = None,
     ) -> AsyncGenerator[Image, None]:
         loop = asyncio.get_running_loop()
-        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-            total_pages = len(pdf.pages)
-            for page_number in page_numbers:
-                if not (1 <= page_number <= total_pages):
-                    raise RuntimeError(
-                        f"Invalid page number: {page_number}. Page number is ordinal number of the page. The document has {total_pages} pages."
-                    )
-
-                logger.debug(f"Extracting page {page_number}...")
-                page = pdf.pages[page_number - 1]
-
-                image = await loop.run_in_executor(
-                    self._thread_pool, self.__get_page_image, page, scaled_size
+
+        total_pages = await self.get_number_of_pages(file_bytes)
+        for page_number in page_numbers:
+            if not (1 <= page_number <= total_pages):
+                raise RuntimeError(
+                    f"Invalid page number: {page_number}. Page number is ordinal number of the page. The document has {total_pages} pages."
                 )
-                logger.debug(f"Extracted page {page_number} as image")
-                yield image
+
+            logger.debug(f"Extracting page {page_number}...")
+
+            # Render in thread pool with a single thread, because pypdfium2 is not thread safe
+            image = await loop.run_in_executor(
+                self._thread_pool,
+                _render_page,
+                file_bytes,
+                page_number,
+                scaled_size,
+            )
+            logger.debug(f"Extracted page {page_number} as image")
+            yield image
@@ -27,7 +27,9 @@ async def extract_page_images(
         )
         return None
 
-    number_of_pages = extract_number_of_pages(mime_type, original_document)
+    number_of_pages = await extract_number_of_pages(
+        mime_type, original_document
+    )
 
     stageio.write("Extracting page images\n")
     stageio.write(f"Number of pages: {number_of_pages}\n")
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Vulnerability in build dependency, accept the risk`
	`2`	`+CVE-2025-14009 exp:2026-03-31`
Original file line number	Diff line number	Diff line change
`@@ -61,8 +61,8 @@ async def extract_pages(`
`61`	`61`	`]`
`62`	`62`
`63`	`63`
`64`		`-def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:`
`65`		`- return get_extractor(mime_type).get_number_of_pages(document_bytes)`
	`64`	`+async def extract_number_of_pages(mime_type: str, document_bytes: bytes) -> int:`
	`65`	`+ return await get_extractor(mime_type).get_number_of_pages(document_bytes)`
`66`	`66`
`67`	`67`
`68`	`68`	`def are_image_pages_supported(mime: str) -> bool:`
Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,9 @@ async def extract_page_images(`
`27`	`27`	`)`
`28`	`28`	`return None`
`29`	`29`
`30`		`- number_of_pages = extract_number_of_pages(mime_type, original_document)`
	`30`	`+ number_of_pages = await extract_number_of_pages(`
	`31`	`+ mime_type, original_document`
	`32`	`+ )`
`31`	`33`
`32`	`34`	`stageio.write("Extracting page images\n")`
`33`	`35`	`stageio.write(f"Number of pages: {number_of_pages}\n")`