feat(pptx): integrate python-pptx library for enhanced PPTX parsing

maxpill · maxpill · commit 55380409dae4 · 2025-07-11T11:13:10.000+02:00
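- Added python-pptx as a dependency for improved PPTX file handling.
- Updated the parser import path from `parsers.pptx_parser` to `parsers.pptx` to reflect the new module structure.
- Moved the PPTX dataclasses (`PptxTextContent`, `PptxShapeInfo`, `PptxImageInfo`, `PptxSlideInfo`, `PptxMetadata`) out of the parser module into `parsers/pptx/dataclasses.py`.
- Removed the obsolete pptx.py file to streamline the codebase.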
diff --git a/packages/ragbits-document-search/pyproject.toml b/packages/ragbits-document-search/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
     "rerankers>=0.6.1,<1.0.0",
     "filetype>=1.2.0,<2.0.0",
     "ragbits-core==1.1.0",
+    "python-pptx>=1.0.0,<2.0.0",
 ]
 
 [project.urls]
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py
@@ -1,5 +1,5 @@
 from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
-from ragbits.document_search.ingestion.parsers.pptx_parser import PptxDocumentParser
+from ragbits.document_search.ingestion.parsers.pptx import PptxDocumentParser
 from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
 
 __all__ = [
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/dataclasses.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/dataclasses.py
@@ -0,0 +1,90 @@
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass
+class PptxTextContent:
+    """
+    Represents extracted text content with formatting information.
+    """
+
+    text: str
+    font_name: str | None = None
+    font_size: float | None = None
+    is_bold: bool = False
+    is_italic: bool = False
+    is_underlined: bool = False
+    color: str | None = None
+    hyperlink_url: str | None = None
+    hierarchy_level: int = 0
+
+
+@dataclass
+class PptxShapeInfo:
+    """
+    Represents extracted shape information.
+    """
+
+    shape_type: str
+    name: str
+    left: float
+    top: float
+    width: float
+    height: float
+    rotation: float
+    fill_color: str | None = None
+    line_color: str | None = None
+    text_content: str | None = None
+    is_grouped: bool = False
+
+
+@dataclass
+class PptxImageInfo:
+    """
+    Represents extracted image information.
+    """
+
+    image_data: bytes
+    format: str
+    width: int
+    height: int
+    left: float
+    top: float
+    description: str | None = None
+    ocr_text: str | None = None
+
+
+@dataclass
+class PptxSlideInfo:
+    """
+    Represents comprehensive slide information.
+    """
+
+    slide_number: int
+    slide_image: bytes
+    layout_type: str
+    title: str | None = None
+    content: str | None = None
+    speaker_notes: str | None = None
+    shapes: list[PptxShapeInfo] | None = None
+    images: list[PptxImageInfo] | None = None
+
+
+@dataclass
+class PptxMetadata:
+    """
+    Represents comprehensive document metadata.
+    """
+
+    title: str | None = None
+    author: str | None = None
+    subject: str | None = None
+    keywords: str | None = None
+    category: str | None = None
+    comments: str | None = None
+    created: str | None = None
+    modified: str | None = None
+    last_modified_by: str | None = None
+    revision: int | None = None
+    total_slides: int = 0
+    custom_properties: dict[str, Any] | None = None
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/pptx.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/pptx.py
@@ -1,15 +1,6 @@
-"""
-Comprehensive PPTX parser using python-pptx library.
-
-This module provides a complete solution for parsing PowerPoint files, extracting
-text, shapes, images, slides, and metadata with memory-efficient processing.
-"""
-
 import io
 import logging
-from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
 
 import cv2
 import numpy as np
@@ -25,101 +16,18 @@
 from ragbits.document_search.documents.document import Document, DocumentType
 from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
 from ragbits.document_search.ingestion.parsers.base import DocumentParser
+from ragbits.document_search.ingestion.parsers.pptx.dataclasses import (
+    PptxImageInfo,
+    PptxMetadata,
+    PptxShapeInfo,
+    PptxSlideInfo,
+)
 
 logger = logging.getLogger(__name__)
 
-# Constants
 MAX_TEXT_PREVIEW_LENGTH = 200
 
 
-@dataclass
-class PptxTextContent:
-    """
-    Represents extracted text content with formatting information.
-    """
-
-    text: str
-    font_name: str | None = None
-    font_size: float | None = None
-    is_bold: bool = False
-    is_italic: bool = False
-    is_underlined: bool = False
-    color: str | None = None
-    hyperlink_url: str | None = None
-    hierarchy_level: int = 0
-
-
-@dataclass
-class PptxShapeInfo:
-    """
-    Represents extracted shape information.
-    """
-
-    shape_type: str
-    name: str
-    left: float
-    top: float
-    width: float
-    height: float
-    rotation: float
-    fill_color: str | None = None
-    line_color: str | None = None
-    text_content: str | None = None
-    is_grouped: bool = False
-
-
-@dataclass
-class PptxImageInfo:
-    """
-    Represents extracted image information.
-    """
-
-    image_data: bytes
-    format: str
-    width: int
-    height: int
-    left: float
-    top: float
-    description: str | None = None
-    ocr_text: str | None = None
-
-
-@dataclass
-class PptxSlideInfo:
-    """
-    Represents comprehensive slide information.
-    """
-
-    slide_number: int
-    slide_image: bytes
-    layout_type: str
-    title: str | None = None
-    content: str | None = None
-    speaker_notes: str | None = None
-    shapes: list[PptxShapeInfo] | None = None
-    images: list[PptxImageInfo] | None = None
-
-
-@dataclass
-class PptxMetadata:
-    """
-    Represents comprehensive document metadata.
-    """
-
-    title: str | None = None
-    author: str | None = None
-    subject: str | None = None
-    keywords: str | None = None
-    category: str | None = None
-    comments: str | None = None
-    created: str | None = None
-    modified: str | None = None
-    last_modified_by: str | None = None
-    revision: int | None = None
-    total_slides: int = 0
-    custom_properties: dict[str, Any] | None = None
-
-
 class PptxDocumentParser(DocumentParser):
     """
     Comprehensive PPTX parser that extracts text, shapes, images, slides, and metadata.
diff --git a/uv.lock b/uv.lock

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -35,6 +35,7 @@ dependencies = [` |
| `35` | `35` | `"rerankers>=0.6.1,<1.0.0",` |
| `36` | `36` | `"filetype>=1.2.0,<2.0.0",` |
| `37` | `37` | `"ragbits-core==1.1.0",` |
| | `38` | `+ "python-pptx>=1.0.0,<2.0.0",` |
| `38` | `39` | `]` |
| `39` | `40` | |
| `40` | `41` | `[project.urls]` |