Skip to content

Commit 29e0086

Browse files
committed
feat(pptx): enhance PPTX extractors with reusable methods and improved structure
- Added utility methods in BasePptxExtractor for slide retrieval and text element creation.
- Refactored PptxTextExtractor, PptxHyperlinkExtractor, PptxImageExtractor, PptxShapeExtractor, PptxMetadataExtractor, and PptxSpeakerNotesExtractor to use these methods for cleaner extraction logic.
- Updated extraction logic to handle coordinates and content more effectively.
- Improved documentation for extractor classes.
1 parent 948f935 commit 29e0086

File tree

1 file changed

+161
-116
lines changed
  • packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors

1 file changed

+161
-116
lines changed
Lines changed: 161 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
from __future__ import annotations
22

33
from abc import ABC, abstractmethod
4+
from typing import Callable, Any
45

56
from pptx.presentation import Presentation
67
from pptx.slide import Slide
8+
from pptx.shapes.base import BaseShape
79

810
from ragbits.document_search.documents.document import DocumentMeta
911
from ragbits.document_search.documents.element import ElementLocation, TextElement
@@ -12,6 +14,72 @@
1214
class BasePptxExtractor(ABC):
1315
"""Base class for all PPTX content extractors."""
1416

17+
def _get_slides(self, presentation: Presentation, slide: Slide | None = None) -> list[tuple[int, Slide]]:
    """Return the slides to process, paired with their 1-based slide numbers.

    Args:
        presentation: The presentation that owns the slides.
        slide: Optional single slide to restrict the result to. When omitted,
            every slide of the presentation is returned.

    Returns:
        A list of ``(slide_number, slide)`` tuples. For a single slide the
        number is its position within ``presentation.slides``; if the slide
        cannot be found there, ``1`` is used.
    """
    if slide is None:
        return list(enumerate(presentation.slides, start=1))

    # Report the slide's real position instead of always numbering it 1, so
    # that locations built from this index point at the right slide.
    candidates = presentation.slides if presentation is not None else ()
    for position, candidate in enumerate(candidates, start=1):
        if candidate == slide:
            return [(position, slide)]

    # Slide not found in the presentation: keep the previous numbering.
    return [(1, slide)]
21+
22+
def _create_text_element(
    self,
    element_type: str,
    document_meta: DocumentMeta,
    content: str,
    slide_idx: int,
    shape: BaseShape | None = None,
    coordinates: dict[str, Any] | None = None
) -> TextElement:
    """Build a TextElement located on the given slide.

    Explicit ``coordinates`` are used as given. When they are ``None`` and a
    ``shape`` is supplied, the shape's left/top/width/height are used
    instead. A missing or empty mapping results in empty coordinates.
    """
    coords = coordinates
    if coords is None and shape is not None:
        coords = {name: getattr(shape, name) for name in ("left", "top", "width", "height")}
    if not coords:
        coords = {}

    return TextElement(
        element_type=element_type,
        document_meta=document_meta,
        location=ElementLocation(page_number=slide_idx, coordinates=coords),
        content=content,
    )
51+
52+
def _extract_from_shapes(
    self,
    presentation: Presentation,
    document_meta: DocumentMeta,
    slide: Slide | None,
    shape_filter: Callable[[BaseShape], bool],
    content_extractor: Callable[[BaseShape], str],
    element_type: str = "text"
) -> list[TextElement]:
    """Create one element per shape that passes the filter and yields non-blank content.

    Args:
        presentation: The presentation to read slides from.
        document_meta: Metadata attached to every created element.
        slide: Optional single slide to restrict extraction to.
        shape_filter: Predicate selecting the shapes to extract from.
        content_extractor: Callable returning the text content for a shape.
        element_type: Element type assigned to the created elements.

    Returns:
        The created elements, in slide and shape iteration order. Shapes for
        which the filter, the extractor or element creation raises
        ``AttributeError`` or ``TypeError`` are skipped.
    """
    elements: list[TextElement] = []

    for slide_idx, sld in self._get_slides(presentation, slide):
        for shape in sld.shapes:
            # The filter runs inside the guarded region as well: it probes
            # optional shape attributes just like the extractor does, and a
            # single unsupported shape must not abort the whole extraction.
            try:
                if not shape_filter(shape):
                    continue
                content = content_extractor(shape)
                if not content or not content.strip():
                    continue
                elements.append(
                    self._create_text_element(
                        element_type=element_type,
                        document_meta=document_meta,
                        content=content,
                        slide_idx=slide_idx,
                        shape=shape,
                    )
                )
            except (AttributeError, TypeError):
                continue

    return elements
82+
1583
@abstractmethod
1684
def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
1785
"""Extract content from the presentation or specific slide."""
@@ -22,193 +90,170 @@ def get_extractor_name(self) -> str:
2290

2391

2492
class PptxTextExtractor(BasePptxExtractor):
    """Extractor that turns the text of each text-frame shape into an element."""

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Return text elements for the whole presentation, or only for ``slide`` when given."""

        def has_text(shape: BaseShape) -> bool:
            return shape.has_text_frame

        def frame_text(shape: BaseShape) -> str:
            return str(shape.text_frame.text).strip()

        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=has_text,
            content_extractor=frame_text,
        )

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_text_extractor"
50108

109+
51110
class PptxHyperlinkExtractor(BasePptxExtractor):
    """Extracts hyperlink addresses from shapes."""

    @staticmethod
    def _hyperlink_address(shape: BaseShape) -> str:
        """Return the shape's click-action hyperlink address, or "" when it has none.

        Reading ``click_action`` can raise for shapes that do not support
        click actions (python-pptx raises ``TypeError`` for group shapes).
        ``hasattr`` only swallows ``AttributeError``, so the lookup is guarded
        explicitly here instead.
        """
        try:
            address = shape.click_action.hyperlink.address
        except (AttributeError, TypeError):
            return ""
        return address or ""

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Extract hyperlink content from the presentation or a specific slide.

        Returns:
            One "hyperlink" element per shape whose click action carries a
            non-empty hyperlink address; the address is the element content.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            # Return a real bool, as the filter signature promises.
            shape_filter=lambda shape: bool(self._hyperlink_address(shape)),
            content_extractor=self._hyperlink_address,
            element_type="hyperlink",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_hyperlink_extractor"
77-
78-
class PptxImageExtractor(BasePptxExtractor):
79-
"""Extracts text content with hierarchy, positioning, and formatting."""
80127

81-
def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
82-
"""Extract hyperlink content from the presentation or a specific slide."""
83-
slides = [slide] if slide else list(presentation.slides)
84128

85-
elements: list[TextElement] = []
86-
for slide_idx, sld in enumerate(slides, start=1):
87-
for shape in sld.shapes:
88-
if shape.click_action.hyperlink:
89-
text_frame = shape.text_frame
90-
text = str(text_frame.text).strip()
91-
element = TextElement(
92-
element_type="text",
93-
document_meta=document_meta,
94-
location=ElementLocation(page_number=slide_idx, coordinates={"left": shape.left, "top": shape.top, "width": shape.width, "height": shape.height}),
95-
content=text,
96-
)
97-
elements.append(element)
129+
class PptxImageExtractor(BasePptxExtractor):
    """Extracts image information from shapes."""

    @staticmethod
    def _embedded_image(shape: BaseShape) -> Any:
        """Return the shape's image object, or ``None`` when it has none.

        ``image`` is a property on picture shapes and may raise when the
        picture has no embedded image (python-pptx raises ``ValueError`` in
        that case). ``hasattr`` would let that propagate, so the access is
        guarded explicitly.
        """
        try:
            return shape.image
        except (AttributeError, TypeError, ValueError):
            return None

    def _describe_image(self, shape: BaseShape) -> str:
        """Return the "Image: <filename>" content for a shape, or "" if it has no image."""
        image = self._embedded_image(shape)
        if image is None:
            return ""
        # Fall back to a generic label when the filename is missing or empty.
        filename = getattr(image, "filename", None) or "embedded_image"
        return f"Image: {filename}"

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Extract image content from the presentation or a specific slide.

        Returns:
            One "image" element per shape that exposes an image; the content
            is the text "Image: " followed by the image filename, or
            "embedded_image" when no filename is available.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=lambda shape: self._embedded_image(shape) is not None,
            content_extractor=self._describe_image,
            element_type="image",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_image_extractor"
105146

106147

107148
class PptxShapeExtractor(BasePptxExtractor):
    """Extractor that records the shape type of every shape exposing one."""

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Return one "shape" element per shape, for all slides or only for ``slide``."""

        def exposes_type(shape: BaseShape) -> bool:
            return hasattr(shape, 'shape_type')

        def describe(shape: BaseShape) -> str:
            return f"Shape: {shape.shape_type}"

        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=exposes_type,
            content_extractor=describe,
            element_type="shape",
        )

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_shape_extractor"
134-
165+
166+
135167
class PptxMetadataExtractor(BasePptxExtractor):
    """Extractor for the presentation's core document properties."""

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Return one "metadata" element per non-blank core property.

        Each element's content has the form "<name>: <value>" and is created
        with slide index 0. The ``slide`` argument is not used.
        """
        props = presentation.core_properties
        created = props.created
        modified = props.modified

        # Insertion order of this mapping determines the order of the result.
        named_values = {
            "author": props.author,
            "title": props.title,
            "subject": props.subject,
            "keywords": props.keywords,
            "category": props.category,
            "created": str(created) if created else None,
            "modified": str(modified) if modified else None,
        }

        return [
            self._create_text_element(
                element_type="metadata",
                document_meta=document_meta,
                content=f"{label}: {value}",
                slide_idx=0,
            )
            for label, value in named_values.items()
            if value is not None and str(value).strip()
        ]

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_metadata_extractor"
166-
199+
200+
167201
class PptxSpeakerNotesExtractor(BasePptxExtractor):
    """Extractor for the speaker notes attached to slides."""

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Return "speaker_notes" elements for all slides, or only for ``slide`` when given.

        For every slide that has a notes slide with a notes text frame, the
        stripped notes text is emitted (located by the frame margins), then
        the stripped text of each notes-slide shape that has a text frame
        (located by the shape's geometry). Blank texts are skipped.
        """
        collected: list[TextElement] = []

        for number, current in self._get_slides(presentation, slide):
            if not current.has_notes_slide or current.notes_slide.notes_text_frame is None:
                continue

            notes = current.notes_slide
            frame = notes.notes_text_frame

            body = frame.text.strip()
            if body:
                margins = {
                    "left": frame.margin_left,
                    "right": frame.margin_right,
                    "top": frame.margin_top,
                    "bottom": frame.margin_bottom,
                }
                collected.append(
                    self._create_text_element(
                        element_type="speaker_notes",
                        document_meta=document_meta,
                        content=body,
                        slide_idx=number,
                        coordinates=margins,
                    )
                )

            # NOTE(review): if the notes placeholder is itself one of these
            # shapes, its text may be emitted a second time here - confirm.
            for shape in notes.shapes:
                if not shape.has_text_frame:
                    continue
                shape_text = str(shape.text_frame.text).strip()
                if shape_text:
                    collected.append(
                        self._create_text_element(
                            element_type="speaker_notes",
                            document_meta=document_meta,
                            content=shape_text,
                            slide_idx=number,
                            shape=shape,
                        )
                    )

        return collected

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_speaker_notes_extractor"
205249

250+
206251
# Default extractor instances, one per extractor class defined in this module.
DEFAULT_EXTRACTORS = [
    PptxTextExtractor(),
    PptxHyperlinkExtractor(),
    PptxImageExtractor(),
    PptxShapeExtractor(),
    PptxSpeakerNotesExtractor(),
    PptxMetadataExtractor(),
    # TODO: enable once PptxSlideImageExtractor is defined in or imported into
    # this module; instantiating it without a definition raises NameError as
    # soon as the module is imported.
    # PptxSlideImageExtractor(),
]

0 commit comments

Comments
 (0)