Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
2596a48
chore: update pyproject.toml and ingestion parsers
maxpill Jul 11, 2025
6514cfa
Merge branch 'main' into 687-feat-pptx-parser
maxpill Jul 15, 2025
30b0806
Merge branch '687-feat-pptx-parser' of https://github.com/deepsense-a…
maxpill Jul 15, 2025
00ad9dc
feat(pptx): add temporary testing script and enhance PPTX parser
maxpill Jul 15, 2025
55fee9c
fix(test): cast shapes in PPTX parser test for correct type handling
maxpill Jul 15, 2025
09f56cf
Merge branch 'main' into 687-feat-pptx-parser
maxpill Jul 15, 2025
2c4c5ee
feat: add pptx document parser to changelog
maxpill Jul 15, 2025
fd5a150
Merge branch '687-feat-pptx-parser' of https://github.com/deepsense-a…
maxpill Jul 15, 2025
d5a3e28
Apply suggestions from code review
maxpill Jul 15, 2025
c7fa386
Merge branch '687-feat-pptx-parser' of https://github.com/deepsense-a…
maxpill Jul 16, 2025
da462ae
application/pdf export to mime map
pocucan-ds Jul 16, 2025
5f6b8a6
impersonation feature for client.
pocucan-ds Jul 16, 2025
310968d
feat(pptx): implement error handling and logging in PPTX parser
maxpill Jul 17, 2025
426a6ba
chore: remove temporary PPTX parser test script
maxpill Jul 17, 2025
ea41ed3
refactor(pptx): convert extractor methods to static methods
maxpill Jul 17, 2025
38ffe8b
remove impersonation at class level and define it at instance level.
pocucan-ds Jul 18, 2025
466fe37
updated tests
pocucan-ds Jul 18, 2025
5dd7fe4
impersonation same loop
pocucan-ds Jul 18, 2025
238f8fd
updated environment variables.
pocucan-ds Jul 18, 2025
4524989
impersonation support changelog
pocucan-ds Jul 18, 2025
f3e5255
Merge branch 'main' into extension/source/googledrive/impersonator
pocucan-ds Jul 18, 2025
6a65354
updated how to and formatted.
pocucan-ds Jul 18, 2025
4041158
Merge branch 'extension/source/googledrive/impersonator' of https://g…
pocucan-ds Jul 18, 2025
20df7d2
updated for ruff
pocucan-ds Jul 18, 2025
54da82b
updated signature
pocucan-ds Jul 18, 2025
a42909e
Merge branch 'main' of https://github.com/deepsense-ai/ragbits into 6…
maxpill Jul 21, 2025
da272a8
Add impersonation attributes to GoogleDriveSource and update tests fo…
maxpill Jul 21, 2025
7f095c1
Merge branch 'main' into feat/gdrive-impersonator
maxpill Jul 21, 2025
b6efc38
Merge branch 'feat/gdrive-impersonator' of https://github.com/deepsen…
maxpill Jul 21, 2025
55c3f34
Add GoogleDriveExportFormat enum and update MIME type handling in Goo…
maxpill Jul 24, 2025
6932ccf
Merge branch 'feat/gdrive-impersonator' of https://github.com/deepsen…
maxpill Jul 24, 2025
3cc602a
Merge branch 'main' into 687-feat-pptx-parser
pocucan-ds Jul 25, 2025
7ab8b52
Merge branch '687-feat-pptx-parser' of https://github.com/deepsense-a…
maxpill Jul 25, 2025
1ca84ef
refactor(pptx): streamline extraction process and enhance element cre…
maxpill Aug 1, 2025
18cd706
Merge branch '687-feat-pptx-parser' of https://github.com/deepsense-a…
maxpill Aug 7, 2025
aa16967
Merge branch 'develop' of https://github.com/deepsense-ai/ragbits int…
maxpill Aug 11, 2025
797a26e
refactor: update PPTX extractor classes to use properties for extract…
maxpill Aug 12, 2025
7b730e6
feat(pptx): implement callback architecture for PPTX document parsing
maxpill Aug 12, 2025
aec57c4
test(pptx): add integration tests for PPTX document parser
maxpill Aug 12, 2025
a1ed241
Merge branch 'develop' into 687-feat-pptx-parser
mhordynski Sep 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions examples/document-search/test_pptx_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# This is a temporary script for development purposes and PR testing.
# It will be removed before merging.

from __future__ import annotations

import asyncio
import os
from pathlib import Path
from typing import cast

from pptx import Presentation
from pptx.util import Inches
from pptx.shapes.autoshape import Shape

from ragbits.core.sources.local import LocalFileSource
from ragbits.document_search.documents.document import Document, DocumentMeta, DocumentType
from ragbits.document_search.ingestion.parsers.pptx.parser import PptxDocumentParser


async def create_dummy_pptx(file_path: str) -> None:
    """Create a small four-slide PPTX file used to exercise the PPTX parser.

    The deck contains a title slide, a bullet slide with a hyperlink run,
    a slide with an optional picture, and a slide with speaker notes.

    Args:
        file_path: Destination path the presentation is saved to.
    """
    prs = Presentation()

    # Slide 1: Title Slide
    title_slide_layout = prs.slide_layouts[0]
    slide1 = prs.slides.add_slide(title_slide_layout)
    title = slide1.shapes.title
    subtitle = slide1.placeholders[1]
    if title and title.has_text_frame:
        title.text_frame.text = "Test Presentation"
    if subtitle and subtitle.has_text_frame:
        # Placeholders are cast to Shape so type checkers see `text_frame`.
        shape = cast(Shape, subtitle)
        shape.text_frame.text = "A presentation for testing the PPTX parser."

    # Slide 2: Text, Shape, and Hyperlink
    bullet_slide_layout = prs.slide_layouts[1]
    slide2 = prs.slides.add_slide(bullet_slide_layout)
    shapes = slide2.shapes
    title_shape = shapes.title
    if title_shape and title_shape.has_text_frame:
        title_shape.text_frame.text = "This is a slide with text, a shape, and a hyperlink."

    body_shape = shapes.placeholders[1]
    if body_shape and body_shape.has_text_frame:
        tf = cast(Shape, body_shape).text_frame
        tf.text = "This is a bullet point."

        # Second paragraph: plain text followed by a run that carries the link.
        p = tf.add_paragraph()
        p.text = "This is a line with a "
        r = p.add_run()
        r.text = "hyperlink"
        if r.hyperlink:
            r.hyperlink.address = "https://www.google.com"

    # Slide 3: Image
    img_slide_layout = prs.slide_layouts[5]
    slide3 = prs.slides.add_slide(img_slide_layout)
    # The path is relative to the current working directory; the picture is
    # skipped when the asset is not found there.
    img_path = "packages/ragbits-core/tests/assets/img/test.png"
    if os.path.exists(img_path):
        left = top = Inches(1)
        slide3.shapes.add_picture(img_path, left, top)

    # Slide 4: With speaker notes
    notes_slide_layout = prs.slide_layouts[1]
    slide4 = prs.slides.add_slide(notes_slide_layout)
    # A freshly added slide has no notes slide yet, so gating on
    # `has_notes_slide` would skip this step every time. Accessing
    # `notes_slide` creates the notes slide on first use.
    notes_slide = slide4.notes_slide
    text_frame = notes_slide.notes_text_frame
    if text_frame is not None:
        text_frame.text = "These are speaker notes for slide 4."

    prs.save(file_path)


async def main() -> None:
    """Generate a sample PPTX, parse it, and print every extracted element.

    The presentation is written to ``test_pptx.pptx`` in the current working
    directory, wrapped in a ``Document`` backed by a ``LocalFileSource``, and
    passed to ``PptxDocumentParser``. Each element's type, text representation
    and location are printed. Errors raised while building the document or
    parsing are reported on stdout rather than propagated. The generated file
    is removed afterwards so repeated runs do not leave artifacts behind.
    """
    pptx_file = "test_pptx.pptx"
    await create_dummy_pptx(pptx_file)

    try:
        document_meta = DocumentMeta(
            document_type=DocumentType.PPTX,
            source=LocalFileSource(path=Path(pptx_file)),
        )
        document = Document.from_document_meta(document_meta, Path(pptx_file))

        parser = PptxDocumentParser()
        elements = await parser.parse(document)

        print(f"--- Extracted {len(elements)} elements ---")
        for element in elements:
            print(f"Type: {element.element_type}")
            print(f"Content: {element.text_representation}")
            print(f"Location: {element.location}")
            print("-" * 20)

    except Exception as e:
        # Best-effort dev script: report the failure instead of crashing.
        print(f"Error: {e}")
    finally:
        # Clean up the generated presentation whether or not parsing succeeded.
        Path(pptx_file).unlink(missing_ok=True)


# Script entry point: when executed directly, run the async main() to completion.
if __name__ == "__main__":
    asyncio.run(main())
6 changes: 4 additions & 2 deletions packages/ragbits-document-search/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## Unreleased

- feat: add pptx document parser (#693)

## 1.1.0 (2025-07-09)

### Changed
Expand Down Expand Up @@ -179,6 +181,7 @@
## 0.7.0 (2025-01-21)

### Added

- Add CLI command to perform search on DocumentSearch instance (#290).

### Changed
Expand All @@ -202,7 +205,7 @@

### Added

- Distributed ingestion with usage of https://www.ray.io/ (#207)
- Distributed ingestion with usage of <https://www.ray.io/> (#207)
- Documents can be now replaced in existing VectorStore (#210)

### Changed
Expand All @@ -228,7 +231,6 @@
- Add location metadata to documents ingested into DocumentSearch (#122).
- Add LiteLLM Reranker (#109).


### Changed

- ragbits-core updated to version v0.3.0
Expand Down
19 changes: 11 additions & 8 deletions packages/ragbits-document-search/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,15 @@ description = "Document Search module for Ragbits"
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
authors = [
{ name = "deepsense.ai", email = "[email protected]"}
]
authors = [{ name = "deepsense.ai", email = "[email protected]" }]
keywords = [
"Retrieval Augmented Generation",
"RAG",
"Large Language Models",
"LLMs",
"Generative AI",
"GenAI",
"Document Search"
"Document Search",
]
classifiers = [
"Development Status :: 4 - Beta",
Expand All @@ -31,7 +29,14 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = ["docling>=2.15.1,<3.0.0", "opencv-python>=4.11.0.86,<5.0.0.0", "rerankers>=0.6.1,<1.0.0", "filetype>=1.2.0,<2.0.0", "ragbits-core==1.1.0"]
dependencies = [
"docling>=2.15.1,<3.0.0",
"opencv-python>=4.11.0.86,<5.0.0.0",
"rerankers>=0.6.1,<1.0.0",
"filetype>=1.2.0,<2.0.0",
"ragbits-core==1.1.0",
"python-pptx>=1.0.0,<2.0.0",
]

[project.urls]
"Homepage" = "https://github.com/deepsense-ai/ragbits"
Expand All @@ -44,9 +49,7 @@ unstructured = [
"unstructured>=0.16.9,<1.0.0",
"unstructured-client>=0.26.0,<1.0.0",
]
ray = [
"ray[data]>=2.43.0,<3.0.0",
]
ray = ["ray[data]>=2.43.0,<3.0.0"]

[tool.uv]
dev-dependencies = [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
from ragbits.document_search.ingestion.parsers.pptx import PptxDocumentParser
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter

__all__ = ["DocumentParser", "DocumentParserRouter", "ImageDocumentParser", "TextDocumentParser"]
__all__ = [
"DocumentParser",
"DocumentParserRouter",
"ImageDocumentParser",
"PptxDocumentParser",
"TextDocumentParser",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .parser import PptxDocumentParser

__all__ = [
"PptxDocumentParser",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from .extractors import (
DEFAULT_EXTRACTORS,
BasePptxExtractor,
PptxHyperlinkExtractor,
PptxImageExtractor,
PptxMetadataExtractor,
PptxShapeExtractor,
PptxSpeakerNotesExtractor,
PptxTextExtractor,
)

__all__ = [
"DEFAULT_EXTRACTORS",
"BasePptxExtractor",
"PptxHyperlinkExtractor",
"PptxImageExtractor",
"PptxMetadataExtractor",
"PptxShapeExtractor",
"PptxSpeakerNotesExtractor",
"PptxTextExtractor",
]
Loading
Loading