1
1
from __future__ import annotations
2
2
3
3
from abc import ABC , abstractmethod
4
+ from typing import Callable , Any
4
5
5
6
from pptx .presentation import Presentation
6
7
from pptx .slide import Slide
8
+ from pptx .shapes .base import BaseShape
7
9
8
10
from ragbits .document_search .documents .document import DocumentMeta
9
11
from ragbits .document_search .documents .element import ElementLocation , TextElement
12
14
class BasePptxExtractor (ABC ):
13
15
"""Base class for all PPTX content extractors."""
14
16
17
def _get_slides(self, presentation: Presentation, slide: Slide | None = None) -> list[tuple[int, Slide]]:
    """Return the slides to process, paired with their 1-based slide numbers.

    Args:
        presentation: Presentation whose slide collection defines the numbering.
        slide: Optional single slide to restrict processing to.

    Returns:
        A list of ``(slide_number, slide)`` tuples. Without ``slide`` every
        slide of the presentation is returned in order. With ``slide`` a single
        tuple is returned whose number is the slide's actual position in the
        presentation, so locations stay correct when slides are processed one
        at a time. If the slide is not found in the presentation, number 1 is
        used.
    """
    all_slides = list(presentation.slides) if presentation is not None else []
    if slide is None:
        return list(enumerate(all_slides, start=1))

    for number, candidate in enumerate(all_slides, start=1):
        # Identity first (cheap); equality covers distinct proxy objects
        # that wrap the same underlying slide.
        if candidate is slide or candidate == slide:
            return [(number, slide)]

    # Slide not found in the presentation: fall back to number 1.
    return [(1, slide)]
21
+
22
def _create_text_element(
    self,
    element_type: str,
    document_meta: DocumentMeta,
    content: str,
    slide_idx: int,
    shape: BaseShape | None = None,
    coordinates: dict[str, Any] | None = None,
) -> TextElement:
    """Build a ``TextElement`` located on the given slide.

    Args:
        element_type: Value stored as the element's ``element_type``.
        document_meta: Metadata of the document the element belongs to.
        content: Text content of the element.
        slide_idx: Slide number stored as the location's ``page_number``.
        shape: Optional shape whose ``left``/``top``/``width``/``height`` are
            used as coordinates when ``coordinates`` is not given.
        coordinates: Optional explicit coordinates; takes precedence over
            ``shape``.

    Returns:
        The new element. Its location carries an empty coordinate mapping when
        neither ``coordinates`` nor ``shape`` provides one.
    """
    position = coordinates
    if position is None and shape is not None:
        position = {key: getattr(shape, key) for key in ("left", "top", "width", "height")}
    if not position:
        position = {}

    return TextElement(
        element_type=element_type,
        document_meta=document_meta,
        location=ElementLocation(page_number=slide_idx, coordinates=position),
        content=content,
    )
51
+
52
def _extract_from_shapes(
    self,
    presentation: Presentation,
    document_meta: DocumentMeta,
    slide: Slide | None,
    shape_filter: Callable[[BaseShape], bool],
    content_extractor: Callable[[BaseShape], str],
    element_type: str = "text",
) -> list[TextElement]:
    """Extract one element per matching shape, using a filter and a content extractor.

    Args:
        presentation: Presentation to read slides from.
        document_meta: Metadata attached to every created element.
        slide: Optional single slide to restrict extraction to.
        shape_filter: Predicate selecting the shapes to extract from.
        content_extractor: Callable producing the text content for a shape.
        element_type: ``element_type`` assigned to the created elements.

    Returns:
        Elements for all selected shapes whose content is non-blank. Shapes for
        which the filter, the extractor or element creation raises
        ``AttributeError`` or ``TypeError`` are skipped (best effort).
    """
    # Local import so this method does not depend on a module-level import.
    import logging

    log = logging.getLogger(__name__)
    elements: list[TextElement] = []

    for slide_idx, sld in self._get_slides(presentation, slide):
        for shape in sld.shapes:
            try:
                # The filter is evaluated inside the guard as well: property
                # access on some shapes raises instead of returning a value,
                # and one such shape must not abort the whole extraction.
                if not shape_filter(shape):
                    continue
                content = content_extractor(shape)
                if not content or not content.strip():
                    continue
                element = self._create_text_element(
                    element_type=element_type,
                    document_meta=document_meta,
                    content=content,
                    slide_idx=slide_idx,
                    shape=shape,
                )
            except (AttributeError, TypeError) as exc:
                log.debug(
                    "Skipping shape on slide %s while extracting %r elements: %s",
                    slide_idx,
                    element_type,
                    exc,
                )
                continue
            elements.append(element)

    return elements
82
+
15
83
@abstractmethod
16
84
def extract (self , presentation : Presentation , document_meta : DocumentMeta , slide : Slide | None = None ) -> list [TextElement ]:
17
85
"""Extract content from the presentation or specific slide."""
@@ -22,193 +90,170 @@ def get_extractor_name(self) -> str:
22
90
23
91
24
92
class PptxTextExtractor(BasePptxExtractor):
    """Extractor producing one text element per shape that has a text frame."""

    @staticmethod
    def _has_text_frame(shape: BaseShape) -> bool:
        """Tell whether the shape carries a text frame."""
        return shape.has_text_frame

    @staticmethod
    def _frame_text(shape: BaseShape) -> str:
        """Return the shape's text-frame text with surrounding whitespace removed."""
        return str(shape.text_frame.text).strip()

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Collect text-frame content from the whole presentation or from one slide."""
        return self._extract_from_shapes(
            presentation,
            document_meta,
            slide,
            shape_filter=self._has_text_frame,
            content_extractor=self._frame_text,
        )

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_text_extractor"
50
108
109
+
51
110
class PptxHyperlinkExtractor(BasePptxExtractor):
    """Extracts the click-action hyperlink address of shapes."""

    @staticmethod
    def _hyperlink_address(shape: BaseShape) -> str | None:
        """Return the shape's click-action hyperlink address, or None if it has none.

        ``hasattr`` is not enough as a guard here: it only suppresses
        ``AttributeError``, while python-pptx group shapes raise ``TypeError``
        from ``click_action``. Both are treated as "no hyperlink".
        """
        try:
            return shape.click_action.hyperlink.address
        except (AttributeError, TypeError):
            return None

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Extract hyperlink addresses from the presentation or a specific slide.

        Args:
            presentation: Presentation to read slides from.
            document_meta: Metadata attached to every created element.
            slide: Optional single slide to restrict extraction to.

        Returns:
            One ``"hyperlink"`` element per shape whose click action has a
            non-empty hyperlink address.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=lambda shape: bool(self._hyperlink_address(shape)),
            content_extractor=lambda shape: self._hyperlink_address(shape) or "",
            element_type="hyperlink",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_hyperlink_extractor"
77
-
78
class PptxImageExtractor(BasePptxExtractor):
    """Extracts image information from shapes."""

    @staticmethod
    def _embedded_image(shape: BaseShape) -> Any:
        """Return the shape's embedded image object, or None when it has none.

        ``hasattr`` is not enough as a guard here: it only suppresses
        ``AttributeError``, while python-pptx picture shapes raise
        ``ValueError`` from ``image`` when no image is embedded (e.g. linked
        pictures). Both are treated as "no image".
        """
        try:
            return shape.image
        except (AttributeError, ValueError):
            return None

    @classmethod
    def _describe_image(cls, shape: BaseShape) -> str:
        """Return ``"Image: <filename>"`` for the shape, or "" when it has no image."""
        image = cls._embedded_image(shape)
        if image is None:
            return ""
        # The filename may be missing or None; use a stable placeholder then.
        return f"Image: {getattr(image, 'filename', None) or 'embedded_image'}"

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Extract image content from the presentation or a specific slide.

        Args:
            presentation: Presentation to read slides from.
            document_meta: Metadata attached to every created element.
            slide: Optional single slide to restrict extraction to.

        Returns:
            One ``"image"`` element per shape that carries an embedded image.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=lambda shape: self._embedded_image(shape) is not None,
            content_extractor=self._describe_image,
            element_type="image",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_image_extractor"
105
146
106
147
107
148
class PptxShapeExtractor(BasePptxExtractor):
    """Extracts shape information and metadata."""

    @staticmethod
    def _describe_shape(shape: BaseShape) -> str | None:
        """Return ``"Shape: <shape_type>"``, or None when the type cannot be read.

        ``hasattr`` is not enough as a guard here: it only suppresses
        ``AttributeError``, while python-pptx raises ``NotImplementedError``
        from ``shape_type`` for shapes that do not implement it.
        """
        try:
            return f"Shape: {shape.shape_type}"
        except (AttributeError, NotImplementedError):
            return None

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Extract shape metadata from the presentation or a specific slide.

        Args:
            presentation: Presentation to read slides from.
            document_meta: Metadata attached to every created element.
            slide: Optional single slide to restrict extraction to.

        Returns:
            One ``"shape"`` element per shape whose ``shape_type`` can be read.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=lambda shape: self._describe_shape(shape) is not None,
            content_extractor=lambda shape: self._describe_shape(shape) or "",
            element_type="shape",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_shape_extractor"
134
-
165
+
166
+
135
167
class PptxMetadataExtractor(BasePptxExtractor):
    """Extractor that turns the presentation's core properties into metadata elements."""

    # Core properties emitted as-is, in output order.
    _TEXT_FIELDS = ("author", "title", "subject", "keywords", "category")
    # Core properties converted with ``str()`` when set; emitted after the text fields.
    _DATE_FIELDS = ("created", "modified")

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Build one ``"metadata"`` element per non-blank core property.

        The ``slide`` argument is not used; every element is created with slide
        index 0 and content of the form ``"<name>: <value>"``.
        """
        core = presentation.core_properties

        named_values = [(name, getattr(core, name)) for name in self._TEXT_FIELDS]
        for name in self._DATE_FIELDS:
            stamp = getattr(core, name)
            named_values.append((name, str(stamp) if stamp else None))

        return [
            self._create_text_element(
                element_type="metadata",
                document_meta=document_meta,
                content=f"{label}: {value}",
                slide_idx=0,
            )
            for label, value in named_values
            if value is not None and str(value).strip()
        ]

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_metadata_extractor"
166
-
199
+
200
+
167
201
class PptxSpeakerNotesExtractor(BasePptxExtractor):
    """Extracts speaker notes from slides."""

    def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide: Slide | None = None) -> list[TextElement]:
        """Extract speaker notes from the presentation or a specific slide.

        Args:
            presentation: Presentation to read slides from.
            document_meta: Metadata attached to every created element.
            slide: Optional single slide to restrict extraction to.

        Returns:
            ``"speaker_notes"`` elements: one for the notes text of each slide
            (located by the notes text frame's margins) and one for every other
            non-blank text shape on the notes slide (located by the shape's
            position). The notes placeholder itself is emitted only once.
        """
        elements: list[TextElement] = []

        for slide_idx, sld in self._get_slides(presentation, slide):
            # Check has_notes_slide before touching notes_slide: python-pptx
            # documents that accessing notes_slide creates one when missing.
            if not sld.has_notes_slide:
                continue

            notes_slide = sld.notes_slide
            notes_text_frame = notes_slide.notes_text_frame
            if notes_text_frame is None:
                continue

            text = notes_text_frame.text.strip()
            if text:
                elements.append(
                    self._create_text_element(
                        element_type="speaker_notes",
                        document_meta=document_meta,
                        content=text,
                        slide_idx=slide_idx,
                        # The text frame has no position of its own, so its
                        # margins are recorded as the location.
                        coordinates={
                            "left": notes_text_frame.margin_left,
                            "right": notes_text_frame.margin_right,
                            "top": notes_text_frame.margin_top,
                            "bottom": notes_text_frame.margin_bottom,
                        },
                    )
                )

            # The notes placeholder is one of notes_slide.shapes; its text was
            # handled above, so skip it to avoid emitting the notes twice.
            notes_placeholder = notes_slide.notes_placeholder
            for shape in notes_slide.shapes:
                if not shape.has_text_frame:
                    continue
                if notes_placeholder is not None and shape.shape_id == notes_placeholder.shape_id:
                    continue

                shape_text = str(shape.text_frame.text).strip()
                if shape_text:
                    elements.append(
                        self._create_text_element(
                            element_type="speaker_notes",
                            document_meta=document_meta,
                            content=shape_text,
                            slide_idx=slide_idx,
                            shape=shape,
                        )
                    )

        return elements

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_speaker_notes_extractor"
205
249
250
+
206
251
# Extractor instances applied by default, in this order.
DEFAULT_EXTRACTORS = [
    PptxTextExtractor(),
    PptxHyperlinkExtractor(),
    PptxImageExtractor(),
    PptxShapeExtractor(),
    PptxSpeakerNotesExtractor(),
    PptxMetadataExtractor(),
    # TODO: add PptxSlideImageExtractor() once that class is defined in or
    # imported into this module; referencing it before then raises NameError
    # at import time.
]
0 commit comments