deepsense-ai · mhordynski · Sep 4, 2025 · Jul 11, 2025 · Jul 15, 2025 · Jul 15, 2025
diff --git a/examples/document-search/test_pptx_parser.py b/examples/document-search/test_pptx_parser.py
@@ -0,0 +1,102 @@
+# This is a temporary script for development purposes and PR testing.
+# It will be removed before merging.
+
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+from typing import cast
+
+from pptx import Presentation
+from pptx.util import Inches
+from pptx.shapes.autoshape import Shape
+
+from ragbits.core.sources.local import LocalFileSource
+from ragbits.document_search.documents.document import Document, DocumentMeta, DocumentType
+from ragbits.document_search.ingestion.parsers.pptx.parser import PptxDocumentParser
+
+
+async def create_dummy_pptx(file_path: str) -> None:
+    """Create a small four-slide PPTX file used to exercise the PPTX parser.
+
+    The deck contains a title slide, a bullet slide with a hyperlink run, a
+    slide that receives a picture when the sample image is available, and a
+    slide with speaker notes.
+
+    The function awaits nothing; it stays a coroutine so existing callers that
+    ``await`` it keep working.
+
+    Args:
+        file_path: Destination path the presentation is saved to.
+    """
+    prs = Presentation()
+
+    # Slide 1: Title Slide
+    title_slide_layout = prs.slide_layouts[0]
+    slide1 = prs.slides.add_slide(title_slide_layout)
+    title = slide1.shapes.title
+    subtitle = slide1.placeholders[1]
+    if title and title.has_text_frame:
+        title.text_frame.text = "Test Presentation"
+    if subtitle and subtitle.has_text_frame:
+        # The placeholder is cast to Shape only to satisfy the type checker.
+        cast(Shape, subtitle).text_frame.text = "A presentation for testing the PPTX parser."
+
+    # Slide 2: Text, Shape, and Hyperlink
+    bullet_slide_layout = prs.slide_layouts[1]
+    slide2 = prs.slides.add_slide(bullet_slide_layout)
+    shapes = slide2.shapes
+    title_shape = shapes.title
+    if title_shape and title_shape.has_text_frame:
+        title_shape.text_frame.text = "This is a slide with text, a shape, and a hyperlink."
+
+    body_shape = shapes.placeholders[1]
+    if body_shape and body_shape.has_text_frame:
+        tf = cast(Shape, body_shape).text_frame
+        tf.text = "This is a bullet point."
+
+        p = tf.add_paragraph()
+        p.text = "This is a line with a "
+        r = p.add_run()
+        r.text = "hyperlink"
+        # The run's hyperlink proxy is always available; assigning the address
+        # is what actually creates the link.
+        r.hyperlink.address = "https://www.google.com"
+
+    # Slide 3: Image
+    img_slide_layout = prs.slide_layouts[5]
+    slide3 = prs.slides.add_slide(img_slide_layout)
+    # NOTE(review): relative path, presumably resolved from the repository root - confirm.
+    img_path = "packages/ragbits-core/tests/assets/img/test.png"
+    if Path(img_path).exists():
+        left = top = Inches(1)
+        slide3.shapes.add_picture(img_path, left, top)
+    else:
+        # Make the skipped picture visible instead of silently producing an empty slide.
+        print(f"Image not found at {img_path}; slide 3 is left without a picture.")
+
+    # Slide 4: With speaker notes
+    notes_slide_layout = prs.slide_layouts[1]
+    slide4 = prs.slides.add_slide(notes_slide_layout)
+    # A freshly added slide has no notes slide yet, so guarding on
+    # `has_notes_slide` would always skip this step. Accessing `notes_slide`
+    # creates the notes slide on demand.
+    notes_text_frame = slide4.notes_slide.notes_text_frame
+    if notes_text_frame is not None:
+        notes_text_frame.text = "These are speaker notes for slide 4."
+
+    prs.save(file_path)
+
+
+async def main() -> None:
+    """Generate a sample deck, parse it with PptxDocumentParser, and print each element.
+
+    Errors raised while building the document or parsing it are reported on
+    stdout rather than propagated. The generated presentation is deleted
+    afterwards so repeated runs do not leave artifacts in the working directory.
+    """
+    pptx_file = "test_pptx.pptx"
+    pptx_path = Path(pptx_file)
+    await create_dummy_pptx(pptx_file)
+
+    try:
+        document_meta = DocumentMeta(
+            document_type=DocumentType.PPTX,
+            source=LocalFileSource(path=pptx_path),
+        )
+        document = Document.from_document_meta(document_meta, pptx_path)
+
+        parser = PptxDocumentParser()
+        elements = await parser.parse(document)
+
+        print(f"--- Extracted {len(elements)} elements ---")
+        for element in elements:
+            print(f"Type: {element.element_type}")
+            print(f"Content: {element.text_representation}")
+            print(f"Location: {element.location}")
+            print("-" * 20)
+
+    except Exception as e:
+        print(f"Error: {e}")
+    finally:
+        # Remove the generated deck whether or not parsing succeeded.
+        pptx_path.unlink(missing_ok=True)
+
+
+if __name__ == "__main__":
+    asyncio.run(main()) 
diff --git a/packages/ragbits-document-search/CHANGELOG.md b/packages/ragbits-document-search/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## Unreleased
 
+- feat: add pptx document parser (#693)
+
 ## 1.1.0 (2025-07-09)
 
 ### Changed
@@ -179,6 +181,7 @@
 ## 0.7.0 (2025-01-21)
 
 ### Added
+
 - Add CLI command to perform search on DocumentSearch instance (#290).
 
 ### Changed
@@ -202,7 +205,7 @@
 
 ### Added
 
-- Distributed ingestion with usage of https://www.ray.io/ (#207)
+- Distributed ingestion with usage of <https://www.ray.io/> (#207)
 - Documents can be now replaced in existing VectorStore (#210)
 
 ### Changed
@@ -228,7 +231,6 @@
 - Add location metadata to documents ingested into DocumentSearch (#122).
 - Add LiteLLM Reranker (#109).
 
-
 ### Changed
 
 - ragbits-core updated to version v0.3.0

diff --git a/packages/ragbits-document-search/pyproject.toml b/packages/ragbits-document-search/pyproject.toml
@@ -5,17 +5,15 @@ description = "Document Search module for Ragbits"
 readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"
-authors = [
-    { name = "deepsense.ai", email = "ragbits@deepsense.ai"}
-]
+authors = [{ name = "deepsense.ai", email = "ragbits@deepsense.ai" }]
 keywords = [
     "Retrieval Augmented Generation",
     "RAG",
     "Large Language Models",
     "LLMs",
     "Generative AI",
     "GenAI",
-    "Document Search"
+    "Document Search",
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -31,7 +29,14 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-dependencies = ["docling>=2.15.1,<3.0.0", "opencv-python>=4.11.0.86,<5.0.0.0", "rerankers>=0.6.1,<1.0.0", "filetype>=1.2.0,<2.0.0", "ragbits-core==1.1.0"]
+dependencies = [
+    "docling>=2.15.1,<3.0.0",
+    "opencv-python>=4.11.0.86,<5.0.0.0",
+    "rerankers>=0.6.1,<1.0.0",
+    "filetype>=1.2.0,<2.0.0",
+    "ragbits-core==1.1.0",
+    "python-pptx>=1.0.0,<2.0.0",
+]
 
 [project.urls]
 "Homepage" = "https://github.com/deepsense-ai/ragbits"
@@ -44,9 +49,7 @@ unstructured = [
     "unstructured>=0.16.9,<1.0.0",
     "unstructured-client>=0.26.0,<1.0.0",
 ]
-ray = [
-    "ray[data]>=2.43.0,<3.0.0",
-]
+ray = ["ray[data]>=2.43.0,<3.0.0"]
 
 [tool.uv]
 dev-dependencies = [

diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py
@@ -1,4 +1,11 @@
 from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
+from ragbits.document_search.ingestion.parsers.pptx import PptxDocumentParser
 from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
 
-__all__ = ["DocumentParser", "DocumentParserRouter", "ImageDocumentParser", "TextDocumentParser"]
+__all__ = [
+    "DocumentParser",
+    "DocumentParserRouter",
+    "ImageDocumentParser",
+    "PptxDocumentParser",
+    "TextDocumentParser",
+]
diff --git a/...es/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/__init__.py b/...es/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/__init__.py
@@ -0,0 +1,5 @@
+from .parser import PptxDocumentParser
+
+__all__ = [
+    "PptxDocumentParser",
+]
diff --git a/...document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors/__init__.py b/...document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors/__init__.py
@@ -0,0 +1,21 @@
+from .extractors import (
+    DEFAULT_EXTRACTORS,
+    BasePptxExtractor,
+    PptxHyperlinkExtractor,
+    PptxImageExtractor,
+    PptxMetadataExtractor,
+    PptxShapeExtractor,
+    PptxSpeakerNotesExtractor,
+    PptxTextExtractor,
+)
+
+__all__ = [
+    "DEFAULT_EXTRACTORS",
+    "BasePptxExtractor",
+    "PptxHyperlinkExtractor",
+    "PptxImageExtractor",
+    "PptxMetadataExtractor",
+    "PptxShapeExtractor",
+    "PptxSpeakerNotesExtractor",
+    "PptxTextExtractor",
+]