feat: add PPTX document parser and update dependencies

maxpill · maxpill · commit 2be019b12718 · 2025-07-04T10:52:57.000+02:00
- Introduced PptxDocumentParser for extracting content from PPTX files.
- Updated pyproject.toml to include python-pptx as a dependency.
- Modified the router to use PptxDocumentParser for PPTX document types.
- Cleaned up formatting in pyproject.toml and __init__.py for consistency.
diff --git a/packages/ragbits-document-search/pyproject.toml b/packages/ragbits-document-search/pyproject.toml
@@ -5,17 +5,15 @@ description = "Document Search module for Ragbits"
 readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"
-authors = [
-    { name = "deepsense.ai", email = "ragbits@deepsense.ai"}
-]
+authors = [{ name = "deepsense.ai", email = "ragbits@deepsense.ai" }]
 keywords = [
     "Retrieval Augmented Generation",
     "RAG",
     "Large Language Models",
     "LLMs",
     "Generative AI",
     "GenAI",
-    "Document Search"
+    "Document Search",
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -31,7 +29,14 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["docling>=2.15.1,<3.0.0", "opencv-python>=4.11.0.86,<5.0.0.0", "rerankers>=0.6.1,<1.0.0", "filetype>=1.2.0,<2.0.0", "ragbits-core==1.0.0"]
+dependencies = [
+    "docling>=2.15.1,<3.0.0",
+    "opencv-python>=4.11.0.86,<5.0.0.0",
+    "rerankers>=0.6.1,<1.0.0",
+    "filetype>=1.2.0,<2.0.0",
+    "ragbits-core==1.0.0",
+    "python-pptx>=0.6.23,<1.0.0",
+]
 
 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
@@ -44,9 +49,7 @@ unstructured = [
     "unstructured>=0.16.9,<1.0.0",
     "unstructured-client>=0.26.0,<1.0.0",
 ]
-ray = [
-    "ray[data]>=2.43.0,<3.0.0",
-]
+ray = ["ray[data]>=2.43.0,<3.0.0"]
 
 [tool.uv]
 dev-dependencies = [
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py
@@ -1,4 +1,11 @@
 from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
+from ragbits.document_search.ingestion.parsers.pptx_parser import PptxDocumentParser
 from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
 
-__all__ = ["DocumentParser", "DocumentParserRouter", "ImageDocumentParser", "TextDocumentParser"]
+__all__ = [
+    "DocumentParser",
+    "DocumentParserRouter",
+    "ImageDocumentParser",
+    "PptxDocumentParser",
+    "TextDocumentParser",
+]
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx_parser.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx_parser.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+from collections.abc import Iterable, Iterator
+from typing import Any
+
+from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE
+
+from ragbits.document_search.documents.document import Document, DocumentType
+from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
+from ragbits.document_search.ingestion.parsers.base import DocumentParser
+
+
+class PptxDocumentParser(DocumentParser):
+    """Parser that extracts content from PPTX files using *python-pptx*.
+
+    The parser retrieves text from all textual shapes, table cells and slide notes, as well as
+    the binary bytes of pictures embedded in the presentation. Each piece of data is converted
+    into a corresponding :class:`~ragbits.document_search.documents.element.TextElement` or
+    :class:`~ragbits.document_search.documents.element.ImageElement`.
+    """
+
+    supported_document_types = {DocumentType.PPTX}
+
+    async def parse(self, document: Document) -> list[Element]:
+        """Parse the given PPTX document.
+
+        Args:
+            document: The document to parse.
+
+        Returns:
+            A list of extracted elements.
+        """
+        self.validate_document_type(document.metadata.document_type)
+        presentation = Presentation(str(document.local_path))
+        elements: list[Element] = []
+
+        for slide_idx, slide in enumerate(presentation.slides, start=1):
+            slide_location = ElementLocation(page_number=slide_idx)
+
+            for shape in slide.shapes:
+                if shape.has_text_frame:
+                    text = shape.text
+                    if text and text.strip():
+                        elements.append(
+                            TextElement(
+                                document_meta=document.metadata,
+                                location=slide_location,
+                                content=text.strip(),
+                            )
+                        )
+                if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
+                    for row in shape.table.rows:
+                        for cell in row.cells:
+                            cell_text = cell.text
+                            if cell_text and cell_text.strip():
+                                elements.append(
+                                    TextElement(
+                                        document_meta=document.metadata,
+                                        location=slide_location,
+                                        content=cell_text.strip(),
+                                    )
+                                )
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                    image_bytes = shape.image.blob
+                    description = getattr(shape, "alt_text", None) or None
+                    elements.append(
+                        ImageElement(
+                            document_meta=document.metadata,
+                            location=slide_location,
+                            image_bytes=image_bytes,
+                            description=description,
+                        )
+                    )
+
+            if slide.has_notes_slide and slide.notes_slide.notes_text_frame is not None:
+                notes_text = slide.notes_slide.notes_text_frame.text
+                if notes_text and notes_text.strip():
+                    elements.append(
+                        TextElement(
+                            document_meta=document.metadata,
+                            location=slide_location,
+                            content=notes_text.strip(),
+                        )
+                    )
+
+        return elements
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py
@@ -8,6 +8,7 @@
 from ragbits.document_search.ingestion.parsers.base import DocumentParser
 from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser
 from ragbits.document_search.ingestion.parsers.exceptions import ParserNotFoundError
+from ragbits.document_search.ingestion.parsers.pptx_parser import PptxDocumentParser
 
 _default_parser = DoclingDocumentParser()
 
@@ -16,7 +17,7 @@
     DocumentType.MD: _default_parser,
     DocumentType.PDF: _default_parser,
     DocumentType.DOCX: _default_parser,
-    DocumentType.PPTX: _default_parser,
+    DocumentType.PPTX: PptxDocumentParser(),
     DocumentType.XLSX: _default_parser,
     DocumentType.HTML: _default_parser,
     DocumentType.JPG: _default_parser,