Skip to content

Commit 2596a48

Browse files
committed
chore: update pyproject.toml and ingestion parsers
- Reformatted authors and dependencies in pyproject.toml for consistency.
- Added PptxDocumentParser to the list of exported components in the ingestion parsers.
- Updated the router to use PptxDocumentParser for PPTX document types.
1 parent f2c65e0 commit 2596a48

File tree

7 files changed

+348
-10
lines changed

7 files changed

+348
-10
lines changed

packages/ragbits-document-search/pyproject.toml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,15 @@ description = "Document Search module for Ragbits"
55
readme = "README.md"
66
requires-python = ">=3.10"
77
license = "MIT"
8-
authors = [
9-
{ name = "deepsense.ai", email = "ragbits@deepsense.ai"}
10-
]
8+
authors = [{ name = "deepsense.ai", email = "ragbits@deepsense.ai" }]
119
keywords = [
1210
"Retrieval Augmented Generation",
1311
"RAG",
1412
"Large Language Models",
1513
"LLMs",
1614
"Generative AI",
1715
"GenAI",
18-
"Document Search"
16+
"Document Search",
1917
]
2018
classifiers = [
2119
"Development Status :: 4 - Beta",
@@ -31,7 +29,14 @@ classifiers = [
3129
"Topic :: Scientific/Engineering :: Artificial Intelligence",
3230
"Topic :: Software Development :: Libraries :: Python Modules",
3331
]
34-
dependencies = ["docling>=2.15.1,<3.0.0", "opencv-python>=4.11.0.86,<5.0.0.0", "rerankers>=0.6.1,<1.0.0", "filetype>=1.2.0,<2.0.0", "ragbits-core==1.1.0"]
32+
dependencies = [
33+
"docling>=2.15.1,<3.0.0",
34+
"opencv-python>=4.11.0.86,<5.0.0.0",
35+
"rerankers>=0.6.1,<1.0.0",
36+
"filetype>=1.2.0,<2.0.0",
37+
"ragbits-core==1.1.0",
38+
"python-pptx>=1.0.0,<2.0.0",
39+
]
3540

3641
[project.urls]
3742
"Homepage" = "https://github.com/deepsense-ai/ragbits"
@@ -44,9 +49,7 @@ unstructured = [
4449
"unstructured>=0.16.9,<1.0.0",
4550
"unstructured-client>=0.26.0,<1.0.0",
4651
]
47-
ray = [
48-
"ray[data]>=2.43.0,<3.0.0",
49-
]
52+
ray = ["ray[data]>=2.43.0,<3.0.0"]
5053

5154
[tool.uv]
5255
dev-dependencies = [
Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
11
from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
2+
from ragbits.document_search.ingestion.parsers.pptx import PptxDocumentParser
23
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
34

4-
__all__ = ["DocumentParser", "DocumentParserRouter", "ImageDocumentParser", "TextDocumentParser"]
5+
# Names re-exported by the ingestion parsers package (kept in alphabetical order).
__all__ = ["DocumentParser", "DocumentParserRouter", "ImageDocumentParser", "PptxDocumentParser", "TextDocumentParser"]
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .parser import PptxDocumentParser
2+
3+
# The PPTX parser is the only public name of this package.
__all__ = ["PptxDocumentParser"]
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Public API of the PPTX extractors package.
#
# The imported names must match the class names defined in ``extractors.py``
# (all concrete extractors carry the ``Pptx`` prefix) and the entries of
# ``__all__`` below; otherwise importing this package raises ImportError.
from .extractors import (
    DEFAULT_EXTRACTORS,
    BasePptxExtractor,
    PptxHyperlinkExtractor,
    PptxImageExtractor,
    PptxMetadataExtractor,
    PptxShapeExtractor,
    PptxSpeakerNotesExtractor,
    PptxTextExtractor,
)

__all__ = [
    "DEFAULT_EXTRACTORS",
    "BasePptxExtractor",
    "PptxHyperlinkExtractor",
    "PptxImageExtractor",
    "PptxMetadataExtractor",
    "PptxShapeExtractor",
    "PptxSpeakerNotesExtractor",
    "PptxTextExtractor",
]
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
from __future__ import annotations
2+
3+
from abc import ABC, abstractmethod
4+
from typing import Callable, Any
5+
6+
from pptx.presentation import Presentation
7+
from pptx.slide import Slide
8+
from pptx.shapes.base import BaseShape
9+
10+
from ragbits.document_search.documents.document import DocumentMeta
11+
from ragbits.document_search.documents.element import ElementLocation, TextElement
12+
13+
14+
class BasePptxExtractor(ABC):
    """Base class for all PPTX content extractors.

    Subclasses implement :meth:`extract` and :meth:`get_extractor_name`. The
    private helpers cover slide enumeration, element construction and the
    generic "filter shapes, then turn each match into text" loop.
    """

    def _get_slides(self, presentation: Presentation, slide: Slide | None = None) -> list[tuple[int, Slide]]:
        """Return ``(slide_number, slide)`` pairs, numbered from 1.

        Args:
            presentation: The presentation the slides belong to.
            slide: If given, only this slide is returned, paired with its
                actual position inside ``presentation``.

        Returns:
            All slides of the presentation when ``slide`` is None, otherwise a
            single-item list for the requested slide.
        """
        if slide is None:
            return list(enumerate(presentation.slides, start=1))

        try:
            # ``index`` is zero-based; slide numbers reported here start at 1.
            slide_number = presentation.slides.index(slide) + 1
        except ValueError:
            # The slide is not part of this presentation; fall back to 1 so
            # the caller still gets an element instead of an error.
            slide_number = 1
        return [(slide_number, slide)]

    def _create_text_element(
        self,
        element_type: str,
        document_meta: DocumentMeta,
        content: str,
        slide_idx: int,
        shape: BaseShape | None = None,
        coordinates: dict[str, Any] | None = None,
    ) -> TextElement:
        """Create a TextElement with a standardized location.

        Args:
            element_type: Value stored as the element's ``element_type``.
            document_meta: Metadata of the document the element comes from.
            content: Text content of the element.
            slide_idx: Slide number stored as the location's ``page_number``.
            shape: Optional shape whose position and size are used as
                coordinates when ``coordinates`` is not given.
            coordinates: Explicit coordinates; take precedence over ``shape``.

        Returns:
            The constructed text element.
        """
        if coordinates is None and shape is not None:
            coordinates = {
                "left": shape.left,
                "top": shape.top,
                "width": shape.width,
                "height": shape.height,
            }

        location = ElementLocation(page_number=slide_idx, coordinates=coordinates or {})

        return TextElement(
            element_type=element_type,
            document_meta=document_meta,
            location=location,
            content=content,
        )

    def _extract_from_shapes(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None,
        shape_filter: Callable[[BaseShape], bool],
        content_extractor: Callable[[BaseShape], str],
        element_type: str = "text",
    ) -> list[TextElement]:
        """Extract one element per shape accepted by ``shape_filter``.

        Args:
            presentation: The presentation to read.
            document_meta: Metadata attached to every created element.
            slide: Restrict extraction to this slide; None means all slides.
            shape_filter: Predicate selecting the shapes to process.
            content_extractor: Produces the text content for a selected shape.
            element_type: Element type assigned to the created elements.

        Returns:
            Elements for all selected shapes with non-blank content.
        """
        elements: list[TextElement] = []

        for slide_idx, sld in self._get_slides(presentation, slide):
            for shape in sld.shapes:
                # Best-effort: a shape that lacks an expected attribute is
                # skipped. The filter runs inside the ``try`` as well, so a
                # predicate touching a missing attribute cannot abort the
                # extraction for the whole presentation.
                try:
                    if not shape_filter(shape):
                        continue
                    content = content_extractor(shape)
                    if not content or not content.strip():
                        continue
                    elements.append(
                        self._create_text_element(
                            element_type=element_type,
                            document_meta=document_meta,
                            content=content,
                            slide_idx=slide_idx,
                            shape=shape,
                        )
                    )
                except (AttributeError, TypeError):
                    continue

        return elements

    @abstractmethod
    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Extract content from the presentation or specific slide."""

    @abstractmethod
    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
90+
91+
92+
class PptxTextExtractor(BasePptxExtractor):
    """Turns the text frame of every text-bearing shape into a text element."""

    @staticmethod
    def _has_text_frame(shape: BaseShape) -> bool:
        """Tell whether the shape carries a text frame."""
        return shape.has_text_frame

    @staticmethod
    def _frame_text(shape: BaseShape) -> str:
        """Return the shape's text-frame text without surrounding whitespace."""
        return str(shape.text_frame.text).strip()

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Collect text elements from all slides, or only from ``slide`` when given."""
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=self._has_text_frame,
            content_extractor=self._frame_text,
        )

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_text_extractor"
108+
109+
110+
class PptxHyperlinkExtractor(BasePptxExtractor):
    """Extracts hyperlink addresses from the click actions of shapes."""

    @staticmethod
    def _hyperlink_address(shape: BaseShape) -> str | None:
        """Return the click-action hyperlink address of ``shape``, if any.

        Shapes without a usable click action yield None instead of raising:
        some shape types have no ``click_action`` attribute (AttributeError)
        and group shapes reject the access outright (TypeError).
        """
        try:
            return shape.click_action.hyperlink.address
        except (AttributeError, TypeError):
            return None

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Extract hyperlink content from the presentation or a specific slide.

        Args:
            presentation: The presentation to read.
            document_meta: Metadata attached to every created element.
            slide: Restrict extraction to this slide; None means all slides.

        Returns:
            One ``hyperlink`` element per shape whose click action has an address.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            # The predicate must return a real bool, not the address itself.
            shape_filter=lambda shape: bool(self._hyperlink_address(shape)),
            content_extractor=lambda shape: self._hyperlink_address(shape) or "",
            element_type="hyperlink",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_hyperlink_extractor"
127+
128+
129+
class PptxImageExtractor(BasePptxExtractor):
    """Extracts image information from shapes."""

    @staticmethod
    def _get_image(shape: BaseShape) -> Any | None:
        """Return the image embedded in ``shape``, or None if there is none.

        Only picture shapes expose ``image``; other shapes raise AttributeError
        on access, and an empty picture placeholder raises ValueError. Both
        cases mean "no image here" rather than a parsing failure.
        """
        try:
            return shape.image
        except (AttributeError, ValueError):
            return None

    @classmethod
    def _describe_image(cls, shape: BaseShape) -> str:
        """Build the textual description stored as the element content."""
        image = cls._get_image(shape)
        # ``filename`` may be present but empty/None (e.g. an image loaded from
        # a stream), so fall back on falsy values, not only on a missing attribute.
        filename = getattr(image, "filename", None) or "embedded_image"
        return f"Image: {filename}"

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Extract image content from the presentation or a specific slide.

        Args:
            presentation: The presentation to read.
            document_meta: Metadata attached to every created element.
            slide: Restrict extraction to this slide; None means all slides.

        Returns:
            One ``image`` element per shape that has an embedded image.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=lambda shape: self._get_image(shape) is not None,
            content_extractor=self._describe_image,
            element_type="image",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_image_extractor"
146+
147+
148+
class PptxShapeExtractor(BasePptxExtractor):
    """Reports the shape type of every shape as a ``shape`` element."""

    @staticmethod
    def _exposes_shape_type(shape: BaseShape) -> bool:
        """Tell whether the shape has a ``shape_type`` attribute."""
        return hasattr(shape, 'shape_type')

    @staticmethod
    def _describe(shape: BaseShape) -> str:
        """Render the shape type as the element content."""
        return f"Shape: {shape.shape_type}"

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Collect shape-type elements from all slides, or only from ``slide`` when given."""
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=self._exposes_shape_type,
            content_extractor=self._describe,
            element_type="shape",
        )

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_shape_extractor"
165+
166+
167+
class PptxMetadataExtractor(BasePptxExtractor):
    """Emits the presentation's core properties as ``metadata`` elements."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Build one element per non-empty core property.

        The ``slide`` argument is accepted for interface compatibility but is
        not used: the properties belong to the presentation as a whole, and
        every element is located on page number 0.
        """
        props = presentation.core_properties
        created = props.created
        modified = props.modified

        # Insertion order of this mapping is the order of the emitted elements.
        candidates = {
            "author": props.author,
            "title": props.title,
            "subject": props.subject,
            "keywords": props.keywords,
            "category": props.category,
            "created": str(created) if created else None,
            "modified": str(modified) if modified else None,
        }

        return [
            self._create_text_element(
                element_type="metadata",
                document_meta=document_meta,
                content=f"{label}: {value}",
                slide_idx=0,
            )
            for label, value in candidates.items()
            if value is not None and str(value).strip()
        ]

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_metadata_extractor"
199+
200+
201+
class PptxSpeakerNotesExtractor(BasePptxExtractor):
    """Collects the speaker notes attached to slides."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Return one ``speaker_notes`` element per slide with non-blank notes.

        Slides without a notes slide, without a notes text frame, or with
        blank notes are skipped. The text frame's margins are stored as the
        element coordinates.
        """
        collected: list[TextElement] = []

        for number, current in self._get_slides(presentation, slide):
            # Check ``has_notes_slide`` before touching ``notes_slide``.
            if not current.has_notes_slide:
                continue

            frame = current.notes_slide.notes_text_frame
            if frame is None:
                continue

            notes = frame.text.strip()
            if not notes:
                continue

            margins = {
                "left": frame.margin_left,
                "right": frame.margin_right,
                "top": frame.margin_top,
                "bottom": frame.margin_bottom,
            }
            collected.append(
                self._create_text_element(
                    element_type="speaker_notes",
                    document_meta=document_meta,
                    content=notes,
                    slide_idx=number,
                    coordinates=margins,
                )
            )

        return collected

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_speaker_notes_extractor"
236+
237+
238+
# Default extractor instances, created once at import time in this order.
DEFAULT_EXTRACTORS = [
    extractor_cls()
    for extractor_cls in (
        PptxTextExtractor,
        PptxHyperlinkExtractor,
        PptxImageExtractor,
        PptxShapeExtractor,
        PptxSpeakerNotesExtractor,
        PptxMetadataExtractor,
    )
]
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
5+
from pptx import Presentation
6+
7+
from ragbits.document_search.documents.document import Document, DocumentType
8+
from ragbits.document_search.documents.element import Element
9+
from ragbits.document_search.ingestion.parsers.base import DocumentParser
10+
# The extractor base class is named ``BasePptxExtractor``; the extractors
# package exports no ``BaseExtractor``.
from ragbits.document_search.ingestion.parsers.pptx.extractors import (
    DEFAULT_EXTRACTORS,
    BasePptxExtractor,
)

logger = logging.getLogger(__name__)


class PptxDocumentParser(DocumentParser):
    """
    A comprehensive PPTX parser using python-pptx library with modular extractor architecture.
    """

    supported_document_types = {DocumentType.PPTX}

    def __init__(
        self,
        extractors: list[BasePptxExtractor] | None = None,
    ) -> None:
        """
        Initialize the PPTX parser with configurable extractors.

        Args:
            extractors: List of extractors to use. If None (or empty), a copy of
                DEFAULT_EXTRACTORS is used.
        """
        # Copy the defaults so that changing ``self.extractors`` on one parser
        # never alters the shared module-level list.
        self.extractors = extractors or list(DEFAULT_EXTRACTORS)

    async def parse(self, document: Document) -> list[Element]:
        """
        Parse the PPTX document and return extracted elements.

        Args:
            document: The document to parse.

        Returns:
            List of extracted elements, grouped by extractor in the order the
            extractors are configured.
        """
        self.validate_document_type(document.metadata.document_type)

        presentation = Presentation(document.local_path.as_posix())

        extracted_elements: list[Element] = []
        for extractor in self.extractors:
            # Each extractor walks all slides itself when no slide is passed.
            # Calling it once per extractor keeps slide numbering correct and
            # avoids repeating presentation-level output (e.g. metadata) once
            # per slide. The second positional argument is the document
            # metadata, not a slide.
            extracted_elements.extend(extractor.extract(presentation, document.metadata))

        logger.debug("Extracted %d elements from PPTX document", len(extracted_elements))
        return extracted_elements

0 commit comments

Comments
 (0)