Skip to content

Commit 948f935

Browse files
committed
feat: add PPTX document parser and update dependencies
- Introduced PptxDocumentParser to handle PPTX files. - Updated the list of dependencies in pyproject.toml to include python-pptx. - Cleaned up formatting in pyproject.toml and __init__.py for better readability.
1 parent f2c65e0 commit 948f935

File tree

7 files changed

+317
-10
lines changed

7 files changed

+317
-10
lines changed

packages/ragbits-document-search/pyproject.toml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,15 @@ description = "Document Search module for Ragbits"
55
readme = "README.md"
66
requires-python = ">=3.10"
77
license = "MIT"
8-
authors = [
9-
{ name = "deepsense.ai", email = "[email protected]"}
10-
]
8+
authors = [{ name = "deepsense.ai", email = "[email protected]" }]
119
keywords = [
1210
"Retrieval Augmented Generation",
1311
"RAG",
1412
"Large Language Models",
1513
"LLMs",
1614
"Generative AI",
1715
"GenAI",
18-
"Document Search"
16+
"Document Search",
1917
]
2018
classifiers = [
2119
"Development Status :: 4 - Beta",
@@ -31,7 +29,14 @@ classifiers = [
3129
"Topic :: Scientific/Engineering :: Artificial Intelligence",
3230
"Topic :: Software Development :: Libraries :: Python Modules",
3331
]
34-
dependencies = ["docling>=2.15.1,<3.0.0", "opencv-python>=4.11.0.86,<5.0.0.0", "rerankers>=0.6.1,<1.0.0", "filetype>=1.2.0,<2.0.0", "ragbits-core==1.1.0"]
32+
dependencies = [
33+
"docling>=2.15.1,<3.0.0",
34+
"opencv-python>=4.11.0.86,<5.0.0.0",
35+
"rerankers>=0.6.1,<1.0.0",
36+
"filetype>=1.2.0,<2.0.0",
37+
"ragbits-core==1.1.0",
38+
"python-pptx>=1.0.0,<2.0.0",
39+
]
3540

3641
[project.urls]
3742
"Homepage" = "https://github.com/deepsense-ai/ragbits"
@@ -44,9 +49,7 @@ unstructured = [
4449
"unstructured>=0.16.9,<1.0.0",
4550
"unstructured-client>=0.26.0,<1.0.0",
4651
]
47-
ray = [
48-
"ray[data]>=2.43.0,<3.0.0",
49-
]
52+
ray = ["ray[data]>=2.43.0,<3.0.0"]
5053

5154
[tool.uv]
5255
dev-dependencies = [
Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
11
from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
2+
from ragbits.document_search.ingestion.parsers.pptx import PptxDocumentParser
23
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
34

4-
__all__ = ["DocumentParser", "DocumentParserRouter", "ImageDocumentParser", "TextDocumentParser"]
5+
__all__ = [
6+
"DocumentParser",
7+
"DocumentParserRouter",
8+
"ImageDocumentParser",
9+
"PptxDocumentParser",
10+
"TextDocumentParser",
11+
]
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .parser import PptxDocumentParser
2+
3+
__all__ = [
4+
"PptxDocumentParser",
5+
]
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Re-export the public extractor API. The imported names must match the class
# names actually defined in the extractors module (all prefixed with "Pptx"),
# and must match the names advertised in ``__all__``.
from .extractors import (
    DEFAULT_EXTRACTORS,
    BasePptxExtractor,
    PptxHyperlinkExtractor,
    PptxImageExtractor,
    PptxMetadataExtractor,
    PptxShapeExtractor,
    PptxSpeakerNotesExtractor,
    PptxTextExtractor,
)

__all__ = [
    "DEFAULT_EXTRACTORS",
    "BasePptxExtractor",
    "PptxHyperlinkExtractor",
    "PptxImageExtractor",
    "PptxMetadataExtractor",
    "PptxShapeExtractor",
    "PptxSpeakerNotesExtractor",
    "PptxTextExtractor",
]
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
from __future__ import annotations
2+
3+
from abc import ABC, abstractmethod
4+
5+
from pptx.presentation import Presentation
6+
from pptx.slide import Slide
7+
8+
from ragbits.document_search.documents.document import DocumentMeta
9+
from ragbits.document_search.documents.element import ElementLocation, TextElement
10+
11+
12+
class BasePptxExtractor(ABC):
    """Common interface implemented by every PPTX content extractor."""

    @abstractmethod
    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """
        Extract content from the presentation or from one specific slide.

        Args:
            presentation: The presentation to read content from.
            document_meta: Metadata of the source document.
            slide: Optional slide to restrict the extraction to. Defaults to None.

        Returns:
            The list of extracted elements.
        """

    @abstractmethod
    def get_extractor_name(self) -> str:
        """
        Get the name of this extractor.

        Returns:
            The extractor name.
        """
22+
23+
24+
class PptxTextExtractor(BasePptxExtractor):
    """Extracts the text of every shape that has a text frame, together with the shape's position."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """
        Extract text content from the presentation or a specific slide.

        Args:
            presentation: The presentation to read from.
            document_meta: Metadata attached to every produced element.
            slide: Optional slide to restrict extraction to. When None, all slides are processed.

        Returns:
            One text element per shape with non-empty text.

        Raises:
            ValueError: If `slide` is given but is not part of `presentation`.
        """
        if slide is not None:
            # Look up the slide's real position in the deck; enumerating a one-item
            # list would label every individually processed slide as page 1.
            numbered_slides = [(presentation.slides.index(slide) + 1, slide)]
        else:
            numbered_slides = list(enumerate(presentation.slides, start=1))

        elements: list[TextElement] = []
        for slide_number, sld in numbered_slides:
            for shape in sld.shapes:
                if not shape.has_text_frame:
                    continue
                text = str(shape.text_frame.text).strip()
                if not text:
                    # Empty text frames carry no content worth emitting.
                    continue
                elements.append(
                    TextElement(
                        element_type="text",
                        document_meta=document_meta,
                        location=ElementLocation(
                            page_number=slide_number,
                            coordinates={
                                "left": shape.left,
                                "top": shape.top,
                                "width": shape.width,
                                "height": shape.height,
                            },
                        ),
                        content=text,
                    )
                )

        return elements

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_text_extractor"
50+
51+
class PptxHyperlinkExtractor(BasePptxExtractor):
    """Extracts the target addresses of hyperlinks attached to shape click actions."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """
        Extract hyperlink addresses from the presentation or a specific slide.

        Only hyperlinks set on a shape's click action are reported; hyperlinks on
        individual text runs are not inspected.

        Args:
            presentation: The presentation to read from.
            document_meta: Metadata attached to every produced element.
            slide: Optional slide to restrict extraction to. When None, all slides are processed.

        Returns:
            One text element per shape with a click hyperlink; its content is the link address.

        Raises:
            ValueError: If `slide` is given but is not part of `presentation`.
        """
        if slide is not None:
            # Look up the slide's real position in the deck; enumerating a one-item
            # list would label every individually processed slide as page 1.
            numbered_slides = [(presentation.slides.index(slide) + 1, slide)]
        else:
            numbered_slides = list(enumerate(presentation.slides, start=1))

        elements: list[TextElement] = []
        for slide_number, sld in numbered_slides:
            for shape in sld.shapes:
                try:
                    address = shape.click_action.hyperlink.address
                except (AttributeError, TypeError):
                    # Not every shape supports a click action; python-pptx raises
                    # TypeError for group shapes. Such shapes cannot carry a link.
                    continue
                if not address:
                    continue
                elements.append(
                    TextElement(
                        element_type="text",
                        document_meta=document_meta,
                        location=ElementLocation(
                            page_number=slide_number,
                            coordinates={
                                "left": shape.left,
                                "top": shape.top,
                                "width": shape.width,
                                "height": shape.height,
                            },
                        ),
                        content=address,
                    )
                )

        return elements

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_hyperlink_extractor"
77+
78+
class PptxImageExtractor(BasePptxExtractor):
    """Extracts a textual reference for every picture embedded in the slides."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """
        Extract image references from the presentation or a specific slide.

        NOTE(review): the previous body was a copy of the text/hyperlink logic and
        called `shape.text_frame` on every shape, which fails on pictures. The
        "Image: <filename>" content format used here is a proposal - confirm it
        matches what downstream consumers expect.

        Args:
            presentation: The presentation to read from.
            document_meta: Metadata attached to every produced element.
            slide: Optional slide to restrict extraction to. When None, all slides are processed.

        Returns:
            One text element per shape that holds an embedded image.

        Raises:
            ValueError: If `slide` is given but is not part of `presentation`.
        """
        if slide is not None:
            # Look up the slide's real position in the deck; enumerating a one-item
            # list would label every individually processed slide as page 1.
            numbered_slides = [(presentation.slides.index(slide) + 1, slide)]
        else:
            numbered_slides = list(enumerate(presentation.slides, start=1))

        elements: list[TextElement] = []
        for slide_number, sld in numbered_slides:
            for shape in sld.shapes:
                try:
                    image = shape.image
                except (AttributeError, ValueError):
                    # Non-picture shapes have no `image` attribute; python-pptx raises
                    # ValueError for pictures without embedded image data.
                    continue
                elements.append(
                    TextElement(
                        element_type="text",
                        document_meta=document_meta,
                        location=ElementLocation(
                            page_number=slide_number,
                            coordinates={
                                "left": shape.left,
                                "top": shape.top,
                                "width": shape.width,
                                "height": shape.height,
                            },
                        ),
                        # Fall back to the shape name when the image has no filename.
                        content=f"Image: {image.filename or shape.name}",
                    )
                )

        return elements

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_image_extractor"
105+
106+
107+
class PptxShapeExtractor(BasePptxExtractor):
    """Extracts the type and position of every shape on the slides."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """
        Extract shape information from the presentation or a specific slide.

        NOTE(review): the previous body was a copy of the text/hyperlink logic and
        called `shape.text_frame` on every shape, which fails on shapes without a
        text frame. The "Shape: <shape_type>" content format used here is a
        proposal - confirm it matches what downstream consumers expect.

        Args:
            presentation: The presentation to read from.
            document_meta: Metadata attached to every produced element.
            slide: Optional slide to restrict extraction to. When None, all slides are processed.

        Returns:
            One text element per shape whose type can be determined.

        Raises:
            ValueError: If `slide` is given but is not part of `presentation`.
        """
        if slide is not None:
            # Look up the slide's real position in the deck; enumerating a one-item
            # list would label every individually processed slide as page 1.
            numbered_slides = [(presentation.slides.index(slide) + 1, slide)]
        else:
            numbered_slides = list(enumerate(presentation.slides, start=1))

        elements: list[TextElement] = []
        for slide_number, sld in numbered_slides:
            for shape in sld.shapes:
                try:
                    shape_type = shape.shape_type
                except NotImplementedError:
                    # python-pptx does not implement `shape_type` for every shape class.
                    continue
                if shape_type is None:
                    continue
                elements.append(
                    TextElement(
                        element_type="text",
                        document_meta=document_meta,
                        location=ElementLocation(
                            page_number=slide_number,
                            coordinates={
                                "left": shape.left,
                                "top": shape.top,
                                "width": shape.width,
                                "height": shape.height,
                            },
                        ),
                        content=f"Shape: {shape_type}",
                    )
                )

        return elements

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_shape_extractor"
134+
135+
class PptxMetadataExtractor(BasePptxExtractor):
    """Extracts the presentation's core document properties (author, title, dates, ...)."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """
        Extract core properties of the presentation.

        Args:
            presentation: The presentation to read the core properties from.
            document_meta: Metadata attached to every produced element.
            slide: Unused; core properties belong to the whole presentation. Accepted
                only to satisfy the common extractor interface.

        Returns:
            One "metadata" element per core property that has a non-empty value.
        """
        core_properties = presentation.core_properties
        values = (
            core_properties.author,
            core_properties.title,
            core_properties.subject,
            core_properties.keywords,
            core_properties.category,
            core_properties.created,
            core_properties.modified,
        )

        elements: list[TextElement] = []
        for value in values:
            if value is None:
                continue
            # `created` / `modified` are not strings, so convert every value to text
            # before it is used as element content.
            text = str(value).strip()
            if not text:
                # Unset string properties come back empty; skip them.
                continue
            elements.append(
                TextElement(
                    element_type="metadata",
                    document_meta=document_meta,
                    content=text,
                )
            )

        return elements

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_metadata_extractor"
166+
167+
class PptxSpeakerNotesExtractor(BasePptxExtractor):
    """Extracts the text found on the notes page of each slide."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """
        Extract speaker notes from the presentation or a specific slide.

        Each text-bearing shape of a notes page is emitted exactly once. The notes
        body is itself one of those shapes, so it is not emitted separately (the
        previous implementation produced it twice and reported the text-frame
        margins as its coordinates).

        Args:
            presentation: The presentation to read from.
            document_meta: Metadata attached to every produced element.
            slide: Optional slide to restrict extraction to. When None, all slides are processed.

        Returns:
            One text element per notes-page shape with non-empty text.

        Raises:
            ValueError: If `slide` is given but is not part of `presentation`.
        """
        if slide is not None:
            # Look up the slide's real position in the deck; enumerating a one-item
            # list would label every individually processed slide as page 1.
            numbered_slides = [(presentation.slides.index(slide) + 1, slide)]
        else:
            numbered_slides = list(enumerate(presentation.slides, start=1))

        elements: list[TextElement] = []
        for slide_number, sld in numbered_slides:
            # Check `has_notes_slide` first: per the python-pptx docs, reading
            # `notes_slide` on a slide without notes creates an empty notes page.
            if not sld.has_notes_slide:
                continue
            notes_slide = sld.notes_slide
            if notes_slide.notes_text_frame is None:
                continue

            for shape in notes_slide.shapes:
                if not shape.has_text_frame:
                    continue
                text = str(shape.text_frame.text).strip()
                if not text:
                    continue
                elements.append(
                    TextElement(
                        element_type="text",
                        document_meta=document_meta,
                        location=ElementLocation(
                            page_number=slide_number,
                            coordinates={
                                "left": shape.left,
                                "top": shape.top,
                                "width": shape.width,
                                "height": shape.height,
                            },
                        ),
                        content=text,
                    )
                )

        return elements

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_speaker_notes_extractor"
205+
206+
# One instance of every built-in extractor, in the order they are listed here.
DEFAULT_EXTRACTORS = [
    extractor_cls()
    for extractor_cls in (
        PptxTextExtractor,
        PptxHyperlinkExtractor,
        PptxImageExtractor,
        PptxShapeExtractor,
        PptxSpeakerNotesExtractor,
        PptxMetadataExtractor,
        # PptxSlideImageExtractor,
    )
]
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from __future__ import annotations

import logging

from pptx import Presentation

from ragbits.document_search.documents.document import Document, DocumentType
from ragbits.document_search.documents.element import Element
from ragbits.document_search.ingestion.parsers.base import DocumentParser
from ragbits.document_search.ingestion.parsers.pptx.extractors import (
    DEFAULT_EXTRACTORS,
    BasePptxExtractor,
)

logger = logging.getLogger(__name__)


class PptxDocumentParser(DocumentParser):
    """
    A comprehensive PPTX parser using python-pptx library with modular extractor architecture.
    """

    supported_document_types = {DocumentType.PPTX}

    def __init__(
        self,
        extractors: list[BasePptxExtractor] | None = None,
    ) -> None:
        """
        Initialize the PPTX parser with configurable extractors.

        Args:
            extractors: List of extractors to use. If None (or empty), uses DEFAULT_EXTRACTORS.
        """
        self.extractors = extractors or DEFAULT_EXTRACTORS

    async def parse(self, document: Document) -> list[Element]:
        """
        Parse the PPTX document and return extracted elements.

        Args:
            document: The document to parse.

        Returns:
            List of extracted elements, grouped by extractor in the configured order.
        """
        # NOTE(review): presumably raises for unsupported document types - confirm
        # against DocumentParser.validate_document_type.
        self.validate_document_type(document.metadata.document_type)

        presentation = Presentation(document.local_path.as_posix())

        extracted_elements: list[Element] = []
        for extractor in self.extractors:
            # Call each extractor once for the whole deck and pass the document
            # metadata as the second argument, as the extractor interface expects.
            # Extractors iterate over all slides themselves when no slide is given,
            # so a per-slide loop here would repeat the whole deck once per slide.
            extracted_elements.extend(extractor.extract(presentation, document.metadata))

        return extracted_elements

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from ragbits.document_search.ingestion.parsers.base import DocumentParser
99
from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser
1010
from ragbits.document_search.ingestion.parsers.exceptions import ParserNotFoundError
11+
from ragbits.document_search.ingestion.parsers.pptx.parser import PptxDocumentParser
1112

1213
_default_parser = DoclingDocumentParser()
1314

@@ -16,7 +17,7 @@
1617
DocumentType.MD: _default_parser,
1718
DocumentType.PDF: _default_parser,
1819
DocumentType.DOCX: _default_parser,
19-
DocumentType.PPTX: _default_parser,
20+
DocumentType.PPTX: PptxDocumentParser(),
2021
DocumentType.XLSX: _default_parser,
2122
DocumentType.HTML: _default_parser,
2223
DocumentType.JPG: _default_parser,

0 commit comments

Comments
 (0)