diff --git a/.github/workflows/shared-packages.yml b/.github/workflows/shared-packages.yml index cba2a9103..79404be9b 100644 --- a/.github/workflows/shared-packages.yml +++ b/.github/workflows/shared-packages.yml @@ -184,6 +184,7 @@ jobs: env: GOOGLE_DRIVE_CLIENTID_JSON: ${{ secrets.GOOGLE_DRIVE_CLIENTID_JSON }} GOOGLE_SOURCE_UNIT_TEST_FOLDER: ${{ secrets.GOOGLE_SOURCE_UNIT_TEST_FOLDER }} + GOOGLE_DRIVE_TARGET_EMAIL: ${{ secrets.GOOGLE_DRIVE_TARGET_EMAIL }} - name: Test Report uses: mikepenz/action-junit-report@v4 diff --git a/docs/how-to/sources/google-drive.md b/docs/how-to/sources/google-drive.md index 6fab274e1..75ccfb6dd 100644 --- a/docs/how-to/sources/google-drive.md +++ b/docs/how-to/sources/google-drive.md @@ -187,6 +187,43 @@ async def process_drive_documents(): asyncio.run(process_drive_documents()) ``` +## Impersonating Google Accounts + +You can configure your Google service account to impersonate other users in your Google Workspace domain. This is useful when you need to access files or perform actions on behalf of specific users. + +### Step 1: Enable Domain-Wide Delegation + +1. **Sign in to the [Google Admin Console](https://admin.google.com/) as a Super Admin.** +2. Navigate to: + **Security > Access and data control > API controls > MANAGE DOMAIN WIDE DELEGATION** +3. Add a new API client or edit an existing one, and include the following OAuth scopes: + - `https://www.googleapis.com/auth/cloud-platform` + - `https://www.googleapis.com/auth/drive` +4. Click **Authorize** or **Save** to apply the changes. + +### Step 2: Impersonate a User in Your Code + +After configuring domain-wide delegation, you can specify a target user to impersonate when using the `GoogleDriveSource` in your code. + +```python +from ragbits.core.sources.google_drive import GoogleDriveSource + +target_email = "johnDoe@yourdomain.com" +credentials_file = "service-account-key.json" + +# Set the path to your service account key file +GoogleDriveSource.set_credentials_file_path(credentials_file) + +# Set the email address of the user to impersonate +GoogleDriveSource.set_impersonation_target(target_email) +``` + +**Note:** +- The `target_email` must be a valid user in your Google Workspace domain. +- Ensure your service account has been granted domain-wide delegation as described above. + +This setup allows your service account to act on behalf of the specified user, enabling access to their Google Drive files and resources as permitted by the assigned scopes. 
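With both calls in place, downloads run under the impersonated identity transparently. Below is a minimal end-to-end sketch, assuming the same placeholder credentials file as above and a `FOLDER_ID` placeholder for a folder the impersonated user can read:

```python
import asyncio

from ragbits.core.sources.google_drive import GoogleDriveSource

GoogleDriveSource.set_credentials_file_path("service-account-key.json")
GoogleDriveSource.set_impersonation_target("johnDoe@yourdomain.com")


async def fetch_as_user():
    # "FOLDER_ID" is a placeholder; use a folder the impersonated user can access.
    sources = await GoogleDriveSource.from_uri("FOLDER_ID/**")
    for source in sources:
        if not source.is_folder:
            local_path = await source.fetch()
            print(f"Downloaded {source.file_name} to {local_path}")


asyncio.run(fetch_as_user())
```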
+ ## Troubleshooting ### Common Issues diff --git a/packages/ragbits-core/CHANGELOG.md b/packages/ragbits-core/CHANGELOG.md index f180deae7..55c42e6d4 100644 --- a/packages/ragbits-core/CHANGELOG.md +++ b/packages/ragbits-core/CHANGELOG.md @@ -28,6 +28,7 @@ - Add LLM Usage to LLMResponseWithMetadata (#700) - Split usage per model type (#715) - Add support for batch generation (#608) +- Add Google Drive support for impersonation and presentation-to-PDF export (#724) - Introduce new API for attachments in prompts (#711) - Fix issue with trying to store duplicated entries in Vector Stores (#762) diff --git a/packages/ragbits-core/src/ragbits/core/sources/google_drive.py b/packages/ragbits-core/src/ragbits/core/sources/google_drive.py index 144219bf0..ccd3d4f02 100644 --- a/packages/ragbits-core/src/ragbits/core/sources/google_drive.py +++ b/packages/ragbits-core/src/ragbits/core/sources/google_drive.py @@ -1,6 +1,7 @@ -import os +import os from collections.abc import Iterable from contextlib import suppress +from enum import Enum from pathlib import Path from typing import Any, ClassVar @@ -20,38 +21,61 @@ _SCOPES = ["https://www.googleapis.com/auth/drive"] +# Scopes for which the service account has been granted domain-wide delegation in the Workspace Admin Console. +_IMPERSONATION_SCOPES = [ + "https://www.googleapis.com/auth/cloud-platform", # General Google Cloud access (if needed) + "https://www.googleapis.com/auth/drive", # Google Drive API access +] + # HTTP status codes _HTTP_NOT_FOUND = 404 _HTTP_FORBIDDEN = 403 + +class GoogleDriveExportFormat(str, Enum): + """Supported export MIME types for Google Drive downloads.""" + + DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation" + PDF = "application/pdf" + PNG = "image/png" + HTML = "text/html" + TXT = "text/plain" + JSON = "application/json" + + # Maps Google-native Drive MIME types → export MIME types -_GOOGLE_EXPORT_MIME_MAP = { - "application/vnd.google-apps.document": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", # noqa: E501 - "application/vnd.google-apps.spreadsheet": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", # noqa: E501 - "application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation", # noqa: E501 - "application/vnd.google-apps.drawing": "image/png", - "application/vnd.google-apps.script": "application/vnd.google-apps.script+json", - "application/vnd.google-apps.site": "text/html", - "application/vnd.google-apps.map": "application/json", - "application/vnd.google-apps.form": "application/pdf", +_GOOGLE_EXPORT_MIME_MAP: dict[str, GoogleDriveExportFormat] = { + "application/vnd.google-apps.document": GoogleDriveExportFormat.DOCX, + "application/vnd.google-apps.spreadsheet": GoogleDriveExportFormat.XLSX, + "application/vnd.google-apps.presentation": GoogleDriveExportFormat.PDF, + "application/vnd.google-apps.drawing": GoogleDriveExportFormat.PNG, + "application/vnd.google-apps.script": GoogleDriveExportFormat.JSON, + "application/vnd.google-apps.site": GoogleDriveExportFormat.HTML, + "application/vnd.google-apps.map": GoogleDriveExportFormat.JSON, + "application/vnd.google-apps.form": GoogleDriveExportFormat.PDF, } # Maps export MIME types → file extensions -_EXPORT_EXTENSION_MAP = { -
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx", - "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", - "image/png": ".png", - "application/pdf": ".pdf", - "text/html": ".html", - "text/plain": ".txt", - "application/json": ".json", +_EXPORT_EXTENSION_MAP: dict[GoogleDriveExportFormat, str] = { + GoogleDriveExportFormat.DOCX: ".docx", + GoogleDriveExportFormat.XLSX: ".xlsx", + GoogleDriveExportFormat.PPTX: ".pptx", + GoogleDriveExportFormat.PNG: ".png", + GoogleDriveExportFormat.PDF: ".pdf", + GoogleDriveExportFormat.HTML: ".html", + GoogleDriveExportFormat.TXT: ".txt", + GoogleDriveExportFormat.JSON: ".json", } class GoogleDriveSource(Source): """ Handles source connection for Google Drive and provides methods to fetch files. + + NOTE(Do not define variables at class level that you pass to google client, define them at instance level, or else + google client will complain.): """ file_id: str @@ -62,12 +86,31 @@ class GoogleDriveSource(Source): _google_drive_client: ClassVar["GoogleAPIResource | None"] = None _credentials_file_path: ClassVar[str | None] = None + impersonate: ClassVar[bool | None] = None + impersonate_target_email: ClassVar[str | None] = None @classmethod def set_credentials_file_path(cls, path: str) -> None: """Set the path to the service account credentials file.""" cls._credentials_file_path = path + @classmethod + def set_impersonation_target(cls, target_mail: str) -> None: + """ + Sets the email address to impersonate when accessing Google Drive resources. + + Args: + target_mail (str): The email address to impersonate. + + Raises: + ValueError: If the provided email address is invalid (empty or missing '@'). + """ + # check if email is a valid email. + if not target_mail or "@" not in target_mail: + raise ValueError("Invalid email address provided for impersonation.") + cls.impersonate = True + cls.impersonate_target_email = target_mail + @classmethod def _initialize_client_from_creds(cls) -> None: """ @@ -82,7 +125,20 @@ def _initialize_client_from_creds(cls) -> None: HttpError: If the Google Drive API is not enabled or accessible. Exception: If any other error occurs during client initialization. """ - creds = service_account.Credentials.from_service_account_file(cls._credentials_file_path, scopes=_SCOPES) + cred_kwargs = { + "filename": cls._credentials_file_path, + "scopes": _SCOPES, + } + + # handle impersonation + if cls.impersonate is not None and cls.impersonate: + if not cls.impersonate_target_email: + raise ValueError("Impersonation target email must be set when impersonation is enabled.") + cred_kwargs["subject"] = cls.impersonate_target_email + cred_kwargs["scopes"] = _IMPERSONATION_SCOPES + + creds = service_account.Credentials.from_service_account_file(**cred_kwargs) + cls._google_drive_client = build("drive", "v3", credentials=creds) cls._google_drive_client.files().list( pageSize=1, fields="files(id)", supportsAllDrives=True, includeItemsFromAllDrives=True @@ -162,7 +218,11 @@ def verify_drive_api_enabled(cls) -> None: @traceable @requires_dependencies(["googleapiclient"], "google_drive") - async def fetch(self) -> Path: + async def fetch( + self, + *, + export_format: "GoogleDriveExportFormat | None" = None, + ) -> Path: """ Fetch the file from Google Drive and store it locally. 
@@ -171,6 +231,9 @@ async def fetch(self) -> Path: The local directory is determined by the environment variable `LOCAL_STORAGE_DIR`. If this environment variable is not set, a temporary directory is used. + Args: + export_format: Optional override for the export MIME type when downloading Google-native documents. + Returns: The local path to the downloaded file. @@ -186,7 +249,8 @@ async def fetch(self) -> Path: file_local_dir = local_dir / self.file_id file_local_dir.mkdir(parents=True, exist_ok=True) - export_mime_type, file_extension = self._determine_file_extension() + override_mime = export_format.value if export_format else None + export_mime_type, file_extension = self._determine_file_extension(override_mime=override_mime) local_file_name = f"{self.file_name}{file_extension}" path = file_local_dir / local_file_name @@ -496,22 +560,36 @@ async def from_uri(cls, path: str) -> Iterable[Self]: else: raise ValueError(f"Unsupported Google Drive URI pattern: {path}") - def _determine_file_extension(self) -> tuple[str, str]: + def _determine_file_extension(self, override_mime: str | None = None) -> tuple[str, str]: """ Determine the appropriate file extension and export MIME type for the file. + Args: + override_mime: Optional export MIME type that takes precedence over the one inferred from the file's + MIME type. + Returns: A tuple of (export_mime_type, file_extension) """ + if override_mime is not None: + export_mime_type = override_mime + try: + export_format = GoogleDriveExportFormat(override_mime) + file_extension = _EXPORT_EXTENSION_MAP.get(export_format, ".bin") + except ValueError: + file_extension = Path(self.file_name).suffix if "." in self.file_name else ".bin" + return export_mime_type, file_extension + export_mime_type = self.mime_type file_extension = "" if self.mime_type.startswith("application/vnd.google-apps"): - export_mime_type = _GOOGLE_EXPORT_MIME_MAP.get(self.mime_type, "application/pdf") - file_extension = _EXPORT_EXTENSION_MAP.get(export_mime_type, ".bin") + export_format = _GOOGLE_EXPORT_MIME_MAP.get(self.mime_type, GoogleDriveExportFormat.PDF) + export_mime_type = export_format.value + file_extension = _EXPORT_EXTENSION_MAP.get(export_format, ".bin") elif "."
in self.file_name: file_extension = Path(self.file_name).suffix else: - file_extension = _EXPORT_EXTENSION_MAP.get(self.mime_type, ".bin") + try: + export_format = GoogleDriveExportFormat(self.mime_type) + file_extension = _EXPORT_EXTENSION_MAP.get(export_format, ".bin") + except ValueError: + file_extension = ".bin" return export_mime_type, file_extension diff --git a/packages/ragbits-core/tests/unit/sources/test_google_drive.py b/packages/ragbits-core/tests/unit/sources/test_google_drive.py index 223ae2ec6..776dd5efb 100644 --- a/packages/ragbits-core/tests/unit/sources/test_google_drive.py +++ b/packages/ragbits-core/tests/unit/sources/test_google_drive.py @@ -1,11 +1,11 @@ -import json +import json import os from pathlib import Path import pytest from googleapiclient.errors import HttpError -from ragbits.core.sources.google_drive import GoogleDriveSource +from ragbits.core.sources.google_drive import GoogleDriveExportFormat, GoogleDriveSource @pytest.fixture(autouse=True) @@ -50,6 +50,58 @@ def setup_local_storage_dir(tmp_path: Path): del os.environ["LOCAL_STORAGE_DIR"] +@pytest.mark.asyncio +async def test_google_drive_impersonate(): + """Test fetching files from the shared test folder while impersonating a Workspace user.""" + target_email = os.environ.get("GOOGLE_DRIVE_TARGET_EMAIL") + credentials_file = "test_clientid.json" + + GoogleDriveSource.set_credentials_file_path(credentials_file) + + if target_email is None: + pytest.skip("GOOGLE_DRIVE_TARGET_EMAIL environment variable not set") + + GoogleDriveSource.set_impersonation_target(target_email) + + unit_test_folder_id = os.environ.get("GOOGLE_SOURCE_UNIT_TEST_FOLDER") + + if unit_test_folder_id is None: + pytest.skip("GOOGLE_SOURCE_UNIT_TEST_FOLDER environment variable not set") + + sources_to_download = await GoogleDriveSource.from_uri(f"{unit_test_folder_id}/**") + downloaded_count = 0 + + try: + # Iterate through each source (file or folder) found + for source in sources_to_download: + # Only attempt to fetch files, as folders cannot be "downloaded" in the same way + if not source.is_folder: + try: + # Attempt to fetch (download) the file. + local_path = await source.fetch() + print(f" Downloaded: '{source.file_name}' (ID: {source.file_id}) to '{local_path}'") + downloaded_count += 1 + except HttpError as e: + # Catch Google API specific HTTP errors (e.g., permission denied, file not found) + print(f" Google API Error downloading '{source.file_name}' (ID: {source.file_id}): {e}") + except Exception as e: + # Catch any other general exceptions during the download process + print(f" Failed to download '{source.file_name}' (ID: {source.file_id}): {e}") + else: + print(f" Skipping folder: '{source.file_name}' (ID: {source.file_id})") + + except Exception as e: + # Catch any exceptions that occur during the initial setup or `from_uri` call + print(f"An error occurred during test setup or source retrieval: {e}") + + finally: + # This block ensures the final summary is printed regardless of errors + print(f"\n--- Successfully downloaded {downloaded_count} files from '{unit_test_folder_id}' ---") + # The shared test folder is expected to contain at least one downloadable file. + assert downloaded_count > 0, "Expected to download at least one file, but downloaded 0."
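For reference, a minimal sketch of the new `export_format` override in isolation (the file metadata below is hypothetical, and a configured client with valid credentials is still required for the download itself):

```python
import asyncio

from ragbits.core.sources.google_drive import GoogleDriveExportFormat, GoogleDriveSource


async def fetch_deck_as_pptx() -> None:
    # Hypothetical metadata for a Google Slides deck, which now defaults to PDF export.
    deck = GoogleDriveSource(
        file_id="dummy-id",
        file_name="QuarterlyDeck",
        mime_type="application/vnd.google-apps.presentation",
    )
    # The keyword-only override takes precedence over _GOOGLE_EXPORT_MIME_MAP,
    # so the deck is downloaded as a .pptx file instead of a .pdf.
    local_path = await deck.fetch(export_format=GoogleDriveExportFormat.PPTX)
    print(local_path)


asyncio.run(fetch_deck_as_pptx())
```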
+ + @pytest.mark.asyncio async def test_google_drive_source_fetch_file_not_found(): """Test fetching a non-existent file.""" @@ -99,6 +151,9 @@ async def test_google_drive_source_fetch_file(): """ unit_test_folder_id = os.environ.get("GOOGLE_SOURCE_UNIT_TEST_FOLDER") + if unit_test_folder_id is None: + pytest.skip("GOOGLE_SOURCE_UNIT_TEST_FOLDER environment variable not set") + # Initialize a counter for successfully downloaded files downloaded_count = 0 @@ -141,3 +196,17 @@ # Assert that at least one file was downloaded if that's an expectation for the test # If no files are expected, or it's acceptable for 0 files to be downloaded, remove or adjust this assertion. assert downloaded_count > 0, "Expected to download at least one file, but downloaded 0." + + +def test_determine_file_extension_override(): + """Ensure overriding export MIME type yields expected extension.""" + src = GoogleDriveSource( + file_id="dummy", + file_name="MyDoc", + mime_type="application/vnd.google-apps.document", + ) + + export_mime, extension = src._determine_file_extension(override_mime=GoogleDriveExportFormat.PDF.value) + + assert export_mime == GoogleDriveExportFormat.PDF.value + assert extension == ".pdf" diff --git a/packages/ragbits-document-search/CHANGELOG.md b/packages/ragbits-document-search/CHANGELOG.md index cfbafdb35..d5a6678b7 100644 --- a/packages/ragbits-document-search/CHANGELOG.md +++ b/packages/ragbits-document-search/CHANGELOG.md @@ -2,6 +2,8 @@ ## Unreleased +- Add PPTX document parser (#693) + ## 1.2.2 (2025-08-08) ### Changed @@ -199,6 +201,7 @@ ## 0.7.0 (2025-01-21) ### Added + - Add CLI command to perform search on DocumentSearch instance (#290). ### Changed @@ -222,7 +225,7 @@ ### Added -- Distributed ingestion with usage of https://www.ray.io/ (#207) +- Distributed ingestion with usage of [Ray](https://www.ray.io/) (#207) - Documents can be now replaced in existing VectorStore (#210) ### Changed @@ -248,7 +251,6 @@ - Add location metadata to documents ingested into DocumentSearch (#122). - Add LiteLLM Reranker (#109).
- ### Changed - ragbits-core updated to version v0.3.0 diff --git a/packages/ragbits-document-search/pyproject.toml b/packages/ragbits-document-search/pyproject.toml index cd9cdfad8..0657c42b1 100644 --- a/packages/ragbits-document-search/pyproject.toml +++ b/packages/ragbits-document-search/pyproject.toml @@ -5,9 +5,7 @@ description = "Document Search module for Ragbits" readme = "README.md" requires-python = ">=3.10" license = "MIT" -authors = [ - { name = "deepsense.ai", email = "ragbits@deepsense.ai"} -] +authors = [{ name = "deepsense.ai", email = "ragbits@deepsense.ai" }] keywords = [ "Retrieval Augmented Generation", "RAG", @@ -15,7 +13,7 @@ keywords = [ "LLMs", "Generative AI", "GenAI", - "Document Search" + "Document Search", ] classifiers = [ "Development Status :: 4 - Beta", @@ -31,7 +29,14 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries :: Python Modules", ] -dependencies = ["docling>=2.15.1,<3.0.0", "opencv-python>=4.11.0.86,<5.0.0.0", "rerankers>=0.6.1,<1.0.0", "filetype>=1.2.0,<2.0.0", "ragbits-core==1.2.2"] +dependencies = [ + "docling>=2.15.1,<3.0.0", + "opencv-python>=4.11.0.86,<5.0.0.0", + "rerankers>=0.6.1,<1.0.0", + "filetype>=1.2.0,<2.0.0", + "ragbits-core==1.2.2", + "python-pptx>=1.0.0,<2.0.0", +] [project.urls] "Homepage" = "https://github.com/deepsense-ai/ragbits" @@ -44,9 +49,7 @@ unstructured = [ "unstructured>=0.16.9,<1.0.0", "unstructured-client>=0.26.0,<1.0.0", ] -ray = [ - "ray[data]>=2.43.0,<3.0.0", -] +ray = ["ray[data]>=2.43.0,<3.0.0"] [tool.uv] dev-dependencies = [ diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py index e2a5ef78d..41ddcadb4 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py @@ -1,4 +1,9 @@ from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter -__all__ = ["DocumentParser", "DocumentParserRouter", "ImageDocumentParser", "TextDocumentParser"] +__all__ = [ + "DocumentParser", + "DocumentParserRouter", + "ImageDocumentParser", + "TextDocumentParser", +] diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/__init__.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/__init__.py new file mode 100644 index 000000000..c5f815821 --- /dev/null +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/__init__.py @@ -0,0 +1,28 @@ +from .callbacks import PptxCallback +from .exceptions import ( + PptxExtractionError, + PptxParserError, + PptxPresentationError, +) +from .hyperlink_callback import LinkCallback +from .metadata_callback import MetaCallback +from .parser import PptxDocumentParser +from .speaker_notes_callback import NotesCallback + +DEFAULT_CALLBACKS = [ + NotesCallback(), + LinkCallback(), + MetaCallback(), +] + +__all__ = [ + "DEFAULT_CALLBACKS", + "LinkCallback", + "MetaCallback", + "NotesCallback", + "PptxCallback", + "PptxDocumentParser", + "PptxExtractionError", + "PptxParserError", + "PptxPresentationError", +] diff --git 
a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/callbacks.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/callbacks.py new file mode 100644 index 000000000..7988ae454 --- /dev/null +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/callbacks.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from pathlib import Path + +from docling_core.types.doc import DoclingDocument +from pptx.presentation import Presentation + + +class PptxCallback(ABC): + """ + Abstract base class for PPTX document enhancement callbacks. + """ + + name: str + + @abstractmethod + def __call__( + self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument + ) -> DoclingDocument: + """ + Process PPTX presentation and enhance the docling document. + + Args: + pptx_path: Path to the PPTX file. + presentation: Loaded PPTX presentation. + docling_document: Document to enhance. + + Returns: + Enhanced docling document. + """ + pass diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/exceptions.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/exceptions.py new file mode 100644 index 000000000..1fa9b1db1 --- /dev/null +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/exceptions.py @@ -0,0 +1,52 @@ +from ragbits.document_search.ingestion.parsers.exceptions import ParserError + + +class PptxParserError(ParserError): + """ + Base class for all PPTX parser related exceptions. + """ + + +class PptxExtractionError(PptxParserError): + """ + Raised when an extractor fails to extract content from a shape or slide. + """ + + def __init__(self, extractor_name: str, slide_idx: int, shape_info: str, original_error: Exception) -> None: + """ + Initialize the PptxExtractionError. + + Args: + extractor_name: Name of the extractor that failed. + slide_idx: Index of the slide where extraction failed. + shape_info: Information about the shape that caused the failure. + original_error: The original exception that caused the failure. + """ + message = ( + f"Extractor '{extractor_name}' failed to extract content from slide {slide_idx}. " + f"Shape info: {shape_info}. Original error: {original_error}" + ) + super().__init__(message) + self.extractor_name = extractor_name + self.slide_idx = slide_idx + self.shape_info = shape_info + self.original_error = original_error + + +class PptxPresentationError(PptxParserError): + """ + Raised when the PPTX presentation cannot be loaded or processed. + """ + + def __init__(self, file_path: str, original_error: Exception) -> None: + """ + Initialize the PptxPresentationError. + + Args: + file_path: Path to the PPTX file that failed to load. + original_error: The original exception that caused the failure. + """ + message = f"Failed to load or process PPTX presentation from '{file_path}'. 
Original error: {original_error}" + super().__init__(message) + self.file_path = file_path + self.original_error = original_error diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py new file mode 100644 index 000000000..e7828b66c --- /dev/null +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/hyperlink_callback.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +import logging +from pathlib import Path + +from docling_core.types.doc import BoundingBox, DocItemLabel, DoclingDocument, ProvenanceItem, TextItem +from pptx.presentation import Presentation +from pptx.shapes.group import GroupShape + +from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback +from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError + +logger = logging.getLogger(__name__) + + +class LinkCallback(PptxCallback): + """ + Callback to extract hyperlinks from PPTX shapes. + """ + + name = "link_callback" + + def __call__( + self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument + ) -> DoclingDocument: + """ + Extract hyperlinks from all shapes and add them to the docling document. + + Args: + pptx_path: Path to the PPTX file. + presentation: Loaded PPTX presentation. + docling_document: Document to enhance with hyperlinks. + + Returns: + Enhanced docling document with hyperlinks. + """ + hyperlinks_added = 0 + + for slide_idx, slide in enumerate(presentation.slides, start=1): + for shape in slide.shapes: + try: + hyperlink_address = self._extract_hyperlink_address(shape) + if hyperlink_address: + link_text = f"Link: {hyperlink_address}" + hyperlink_item = TextItem( + self_ref=f"#/links/{slide_idx + hyperlinks_added}", + text=link_text, + orig=link_text, + label=DocItemLabel.TEXT, + prov=[ + ProvenanceItem( + page_no=slide_idx, + bbox=BoundingBox(l=0.0, t=0.0, r=1.0, b=1.0), + charspan=(0, len(link_text)), + ) + ], + ) + + docling_document.texts.append(hyperlink_item) + hyperlinks_added += 1 + + logger.debug("Added hyperlink from slide %d: %s", slide_idx, hyperlink_address) + + except (AttributeError, TypeError) as e: + extraction_error = PptxExtractionError(self.name, slide_idx, "hyperlink from shape", e) + logger.debug( + "Failed to extract hyperlink from shape on slide %d: %s", slide_idx, str(extraction_error) + ) + continue + + if hyperlinks_added > 0: + logger.info("Successfully added %d hyperlinks to docling document", hyperlinks_added) + else: + logger.debug("No hyperlinks found in presentation") + + return docling_document + + @staticmethod + def _extract_hyperlink_address(shape: object) -> str | None: + if not hasattr(shape, "click_action") or isinstance(shape, GroupShape): + return None + if not shape.click_action.hyperlink or not shape.click_action.hyperlink.address: + return None + return shape.click_action.hyperlink.address diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py new file mode 100644 index 000000000..3cd705693 --- /dev/null +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/metadata_callback.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import logging +from pathlib import 
Path + +from docling_core.types.doc import BoundingBox, DocItemLabel, DoclingDocument, ProvenanceItem, TextItem +from pptx.presentation import Presentation + +from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback +from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError + +logger = logging.getLogger(__name__) + + +class MetaCallback(PptxCallback): + """ + Callback to extract presentation metadata from PPTX files. + """ + + name = "meta_callback" + + def __call__( + self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument + ) -> DoclingDocument: + """ + Extract presentation metadata and add it to the docling document. + + Args: + pptx_path: Path to the PPTX file. + presentation: Loaded PPTX presentation. + docling_document: Document to enhance with metadata. + + Returns: + Enhanced docling document with metadata. + """ + metadata_added = 0 + + try: + core_properties = presentation.core_properties + properties = [ + ("author", core_properties.author), + ("title", core_properties.title), + ("subject", core_properties.subject), + ("keywords", core_properties.keywords), + ("category", core_properties.category), + ("created", str(core_properties.created) if core_properties.created else None), + ("modified", str(core_properties.modified) if core_properties.modified else None), + ] + + for prop_name, prop_value in properties: + if prop_value is not None and str(prop_value).strip(): + meta_text = f"{prop_name}: {prop_value}" + metadata_item = TextItem( + self_ref=f"#/metadata/{metadata_added}", + text=meta_text, + orig=meta_text, + label=DocItemLabel.TEXT, + prov=[ + ProvenanceItem( + page_no=0, bbox=BoundingBox(l=0.0, t=0.0, r=1.0, b=1.0), charspan=(0, len(meta_text)) + ) + ], + ) + + docling_document.texts.append(metadata_item) + metadata_added += 1 + + logger.debug("Added metadata: %s = %s", prop_name, prop_value) + except (AttributeError, TypeError) as e: + extraction_error = PptxExtractionError(self.name, 0, "presentation metadata", e) + logger.debug("Failed to extract presentation metadata: %s", str(extraction_error)) + + if metadata_added > 0: + logger.info("Successfully added %d metadata properties to docling document", metadata_added) + else: + logger.debug("No metadata found in presentation") + + return docling_document diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/parser.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/parser.py new file mode 100644 index 000000000..dca59aac6 --- /dev/null +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/parser.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import logging + +from docling.datamodel.base_models import InputFormat +from docling.document_converter import FormatOption +from docling_core.transforms.chunker.base import BaseChunker +from docling_core.types.doc import DoclingDocument +from pptx import Presentation + +from ragbits.document_search.documents.document import Document, DocumentType +from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser +from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback +from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError, PptxPresentationError + +logger = logging.getLogger(__name__) + + +class PptxDocumentParser(DoclingDocumentParser): + """ + Document parser for PPTX files with 
callback-based enhancement. + """ + + supported_document_types = {DocumentType.PPTX} + + def __init__( + self, + ignore_images: bool = False, + num_threads: int = 1, + chunker: BaseChunker | None = None, + format_options: dict[InputFormat, FormatOption] | None = None, + pptx_callbacks: list[PptxCallback] | None = None, + ) -> None: + super().__init__( + ignore_images=ignore_images, + num_threads=num_threads, + chunker=chunker, + format_options=format_options, + ) + + if pptx_callbacks is None: + from ragbits.document_search.ingestion.parsers.pptx import DEFAULT_CALLBACKS + + self.pptx_callbacks = DEFAULT_CALLBACKS + else: + self.pptx_callbacks = pptx_callbacks + + logger.debug("Initialized PptxDocumentParser with %d callbacks", len(self.pptx_callbacks)) + + async def _partition(self, document: Document) -> DoclingDocument: + docling_document = await super()._partition(document) + + if not self.pptx_callbacks: + return docling_document + + logger.info("Enhancing docling document with %d callbacks", len(self.pptx_callbacks)) + + try: + presentation = Presentation(document.local_path.as_posix()) + except Exception as e: + logger.error("Failed to load presentation for callbacks: %s", str(e)) + raise PptxPresentationError(str(document.local_path), e) from e + + successful_callbacks = 0 + for callback in self.pptx_callbacks: + try: + logger.debug("Running callback: %s", callback.name) + docling_document = callback(document.local_path, presentation, docling_document) + successful_callbacks += 1 + logger.debug("Successfully applied callback: %s", callback.name) + except Exception as e: + extraction_error = PptxExtractionError(callback.name, -1, "callback execution", e) + logger.error( + "Callback %s failed: %s. Continuing with other callbacks.", + callback.name, + str(extraction_error), + exc_info=True, + ) + + logger.info( + "Enhanced docling document with %d/%d successful callbacks", + successful_callbacks, + len(self.pptx_callbacks), + ) + return docling_document diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py new file mode 100644 index 000000000..07cb9b37a --- /dev/null +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/speaker_notes_callback.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import logging +from pathlib import Path + +from docling_core.types.doc import BoundingBox, DocItemLabel, DoclingDocument, ProvenanceItem, TextItem +from pptx.presentation import Presentation + +from ragbits.document_search.ingestion.parsers.pptx.callbacks import PptxCallback +from ragbits.document_search.ingestion.parsers.pptx.exceptions import PptxExtractionError + +logger = logging.getLogger(__name__) + + +class NotesCallback(PptxCallback): + """ + Callback to extract speaker notes from PPTX slides. + """ + + name = "notes_callback" + + def __call__( + self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument + ) -> DoclingDocument: + """ + Extract speaker notes from all slides and add them to the docling document. + + Args: + pptx_path: Path to the PPTX file. + presentation: Loaded PPTX presentation. + docling_document: Document to enhance with speaker notes. + + Returns: + Enhanced docling document with speaker notes. 
+ """ + notes_added = 0 + + for slide_idx, slide in enumerate(presentation.slides, start=1): + try: + if slide.has_notes_slide and slide.notes_slide.notes_text_frame is not None: + notes_text_frame = slide.notes_slide.notes_text_frame + text = getattr(notes_text_frame, "text", None) + text = text.strip() if text else None + + if text: + notes_item = TextItem( + self_ref=f"#/notes/{slide_idx}", + text=text, + orig=text, + label=DocItemLabel.TEXT, + prov=[ + ProvenanceItem( + page_no=slide_idx, + bbox=BoundingBox(l=0.0, t=0.0, r=1.0, b=1.0), + charspan=(0, len(text)), + ) + ], + ) + + docling_document.texts.append(notes_item) + notes_added += 1 + + logger.debug("Added speaker notes from slide %d", slide_idx) + + except (AttributeError, TypeError) as e: + extraction_error = PptxExtractionError(self.name, slide_idx, "speaker notes", e) + logger.debug("Failed to extract speaker notes from slide %d: %s", slide_idx, str(extraction_error)) + continue + + if notes_added > 0: + logger.info("Successfully added %d speaker notes to docling document", notes_added) + else: + logger.debug("No speaker notes found in presentation") + + return docling_document diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py index 2c4456e67..a30630f4d 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py @@ -8,6 +8,7 @@ from ragbits.document_search.ingestion.parsers.base import DocumentParser from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser from ragbits.document_search.ingestion.parsers.exceptions import ParserNotFoundError +from ragbits.document_search.ingestion.parsers.pptx.parser import PptxDocumentParser _default_parser = DoclingDocumentParser() @@ -16,7 +17,7 @@ DocumentType.MD: _default_parser, DocumentType.PDF: _default_parser, DocumentType.DOCX: _default_parser, - DocumentType.PPTX: _default_parser, + DocumentType.PPTX: PptxDocumentParser(), DocumentType.XLSX: _default_parser, DocumentType.HTML: _default_parser, DocumentType.JPG: _default_parser, diff --git a/packages/ragbits-document-search/tests/assets/pptx/sample_presentation.pptx b/packages/ragbits-document-search/tests/assets/pptx/sample_presentation.pptx new file mode 100644 index 000000000..20b3202bf Binary files /dev/null and b/packages/ragbits-document-search/tests/assets/pptx/sample_presentation.pptx differ diff --git a/packages/ragbits-document-search/tests/integration/test_pptx_parser.py b/packages/ragbits-document-search/tests/integration/test_pptx_parser.py new file mode 100644 index 000000000..08891ae81 --- /dev/null +++ b/packages/ragbits-document-search/tests/integration/test_pptx_parser.py @@ -0,0 +1,79 @@ +from pathlib import Path + +import pytest +from pptx import Presentation +from pptx.shapes.group import GroupShape + +from ragbits.document_search.documents.document import DocumentMeta +from ragbits.document_search.documents.element import TextElement +from ragbits.document_search.ingestion.parsers.pptx import PptxDocumentParser + + +def _normalize_whitespace(value: str) -> str: + """Normalize whitespace for robust substring checks.""" + return " ".join(value.split()) + + +@pytest.mark.asyncio +async def test_pptx_parser_callbacks_integration() -> None: + """Validate PPTX callbacks using a sample presentation asset.""" + 
pptx_path = Path(__file__).parent.parent / "assets" / "pptx" / "sample_presentation.pptx" + assert pptx_path.exists() + + presentation = Presentation(pptx_path.as_posix()) + + expected_notes: list[str] = [] + for slide in presentation.slides: + notes_slide = getattr(slide, "notes_slide", None) + notes_text_frame = getattr(notes_slide, "notes_text_frame", None) + text = getattr(notes_text_frame, "text", None) + if not text: + continue + text = text.strip() + if text: + expected_notes.append(text) + + expected_links: list[str] = [ + f"Link: {getattr(getattr(getattr(shape, 'click_action', None), 'hyperlink', None), 'address', None)}" + for slide in presentation.slides + for shape in slide.shapes + if not isinstance(shape, GroupShape) + and getattr(getattr(getattr(shape, "click_action", None), "hyperlink", None), "address", None) + ] + + cp = presentation.core_properties + expected_metadata_lines: list[str] = [] + for key in ["author", "title", "subject", "keywords", "category", "created", "modified"]: + value = getattr(cp, key, None) + if value is None: + continue + value_str = str(value).strip() + if value_str: + expected_metadata_lines.append(f"{key}: {value_str}") + + document = await DocumentMeta.from_local_path(pptx_path).fetch() + + parser_with = PptxDocumentParser() + elements_with = await parser_with.parse(document) + text_with = "\n".join(e.content for e in elements_with if isinstance(e, TextElement)) + images_with = [e for e in elements_with if not isinstance(e, TextElement)] + + parser_without = PptxDocumentParser(pptx_callbacks=[]) + elements_without = await parser_without.parse(document) + text_without = "\n".join(e.content for e in elements_without if isinstance(e, TextElement)) + images_without = [e for e in elements_without if not isinstance(e, TextElement)] + + assert len(images_with) == len(images_without) + assert len(text_with) >= len(text_without) + + normalized_with = _normalize_whitespace(text_with) + for note in expected_notes: + assert _normalize_whitespace(note) in normalized_with + + for link_text in expected_links: + assert link_text in text_with + assert link_text not in text_without + + # If a metadata line appears with callbacks enabled, it must come from MetaCallback, + # so it must be absent when the parser runs without callbacks. + for meta_line in expected_metadata_lines: + if meta_line in text_with: + assert meta_line not in text_without
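Extending the parser beyond the built-in notes/link/metadata callbacks only requires implementing `PptxCallback`. A minimal sketch (the `SlideCountCallback` name and its behavior are illustrative, not part of this change):

```python
from pathlib import Path

from docling_core.types.doc import DoclingDocument
from pptx.presentation import Presentation

from ragbits.document_search.ingestion.parsers.pptx import (
    DEFAULT_CALLBACKS,
    PptxCallback,
    PptxDocumentParser,
)


class SlideCountCallback(PptxCallback):
    """Illustrative callback that only reports how many slides were processed."""

    name = "slide_count_callback"

    def __call__(
        self, pptx_path: Path, presentation: Presentation, docling_document: DoclingDocument
    ) -> DoclingDocument:
        print(f"{pptx_path.name}: {len(presentation.slides)} slides")
        return docling_document


# Run the built-in callbacks first, then the custom one.
parser = PptxDocumentParser(pptx_callbacks=[*DEFAULT_CALLBACKS, SlideCountCallback()])
```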