1
+ from __future__ import annotations
2
+
3
+ from abc import ABC , abstractmethod
4
+ from typing import Callable , Any
5
+
6
+ from pptx .presentation import Presentation
7
+ from pptx .slide import Slide
8
+ from pptx .shapes .base import BaseShape
9
+
10
+ from ragbits .document_search .documents .document import DocumentMeta
11
+ from ragbits .document_search .documents .element import ElementLocation , TextElement
12
+
13
+
14
class BasePptxExtractor(ABC):
    """Base class for all PPTX content extractors.

    Offers shared helpers for enumerating slides, building ``TextElement``
    objects with a standardized location, and running a filter/extractor pair
    over every shape. Concrete extractors implement :meth:`extract` and
    :meth:`get_extractor_name`.
    """

    def _get_slides(
        self,
        presentation: Presentation,
        slide: Slide | None = None,
    ) -> list[tuple[int, Slide]]:
        """Return ``(slide_number, slide)`` pairs, numbered from 1.

        Args:
            presentation: Presentation whose slides are enumerated.
            slide: Optional single slide to restrict the result to.

        Returns:
            Every slide of ``presentation`` with its 1-based position when
            ``slide`` is ``None``; otherwise a one-item list holding ``slide``
            paired with its 1-based position inside ``presentation``. If the
            slide cannot be found in the presentation the number falls back
            to 1.
        """
        numbered = list(enumerate(presentation.slides, start=1))
        if slide is None:
            return numbered

        # Look the slide up so that its page number reflects its real position
        # in the deck instead of always being reported as slide 1.
        for number, candidate in numbered:
            if candidate is slide or candidate == slide:
                return [(number, slide)]
        return [(1, slide)]

    def _create_text_element(
        self,
        element_type: str,
        document_meta: DocumentMeta,
        content: str,
        slide_idx: int,
        shape: BaseShape | None = None,
        coordinates: dict[str, Any] | None = None,
    ) -> TextElement:
        """Create a ``TextElement`` with a standardized location.

        Args:
            element_type: Value stored as the element's ``element_type``.
            document_meta: Metadata of the document the element belongs to.
            content: Text content of the element.
            slide_idx: Slide number stored as the location's ``page_number``.
            shape: Optional shape; when ``coordinates`` is not given, its
                ``left``/``top``/``width``/``height`` become the coordinates.
            coordinates: Optional explicit coordinates; take precedence over
                the ones derived from ``shape``.

        Returns:
            The constructed ``TextElement``.
        """
        if coordinates is None and shape is not None:
            coordinates = {
                "left": shape.left,
                "top": shape.top,
                "width": shape.width,
                "height": shape.height,
            }

        location = ElementLocation(
            page_number=slide_idx,
            coordinates=coordinates or {},
        )

        return TextElement(
            element_type=element_type,
            document_meta=document_meta,
            location=location,
            content=content,
        )

    def _extract_from_shapes(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None,
        shape_filter: Callable[[BaseShape], bool],
        content_extractor: Callable[[BaseShape], str],
        element_type: str = "text",
    ) -> list[TextElement]:
        """Extract content from shapes using a filter and a content extractor.

        Args:
            presentation: Presentation to read slides from.
            document_meta: Metadata attached to every created element.
            slide: Optional single slide to restrict extraction to.
            shape_filter: Predicate selecting the shapes to process.
            content_extractor: Callable producing the text for a selected shape.
            element_type: Element type assigned to the created elements.

        Returns:
            One ``TextElement`` per selected shape whose extracted content is
            not blank. Shapes for which the filter, the extractor or element
            creation raises ``AttributeError``, ``TypeError`` or ``ValueError``
            are skipped (best-effort extraction).
        """
        elements: list[TextElement] = []

        for slide_idx, sld in self._get_slides(presentation, slide):
            for shape in sld.shapes:
                try:
                    # The filter runs inside the ``try`` as well: predicates
                    # that probe optional shape attributes may raise for shape
                    # kinds that do not support them, and one such shape must
                    # not abort extraction for the whole presentation.
                    if not shape_filter(shape):
                        continue
                    content = content_extractor(shape)
                    if not content.strip():
                        continue
                    elements.append(
                        self._create_text_element(
                            element_type=element_type,
                            document_meta=document_meta,
                            content=content,
                            slide_idx=slide_idx,
                            shape=shape,
                        )
                    )
                except (AttributeError, TypeError, ValueError):
                    # Deliberate best-effort: skip shapes that cannot be read.
                    continue

        return elements

    @abstractmethod
    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Extract content from the presentation or a specific slide."""

    @abstractmethod
    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
90
+
91
+
92
class PptxTextExtractor(BasePptxExtractor):
    """Extractor that turns the text of shapes' text frames into elements."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Collect text-frame text from the presentation or a single slide.

        Args:
            presentation: Presentation to read.
            document_meta: Metadata attached to every created element.
            slide: Optional slide to restrict extraction to.

        Returns:
            Elements produced by the shared shape-extraction helper for every
            shape that reports having a text frame.
        """

        def owns_text_frame(shape: BaseShape) -> bool:
            return shape.has_text_frame

        def read_stripped_text(shape: BaseShape) -> str:
            raw_text = shape.text_frame.text
            return str(raw_text).strip()

        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=owns_text_frame,
            content_extractor=read_stripped_text,
        )

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_text_extractor"
108
+
109
+
110
class PptxHyperlinkExtractor(BasePptxExtractor):
    """Extracts hyperlink addresses from shapes.

    Only the hyperlink attached to a shape's click action is read.
    """

    @staticmethod
    def _hyperlink_address(shape: BaseShape) -> str | None:
        """Return the shape's click-action hyperlink address, if any.

        Args:
            shape: Shape to inspect.

        Returns:
            The non-empty hyperlink address, or ``None`` when the shape has no
            click action, no address, or does not support click actions.
        """
        try:
            return shape.click_action.hyperlink.address or None
        except (AttributeError, TypeError):
            # ``hasattr`` only suppresses AttributeError, so a shape whose
            # ``click_action`` property raises TypeError would otherwise
            # escape the filter. NOTE(review): python-pptx group shapes are
            # believed to raise TypeError here - confirm against the library.
            return None

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Extract hyperlink content from the presentation or a specific slide.

        Args:
            presentation: Presentation to read.
            document_meta: Metadata attached to every created element.
            slide: Optional slide to restrict extraction to.

        Returns:
            One ``"hyperlink"`` element per shape that has a click-action
            hyperlink address.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            # Return a real bool, as the ``shape_filter`` signature promises.
            shape_filter=lambda shape: self._hyperlink_address(shape) is not None,
            content_extractor=lambda shape: self._hyperlink_address(shape) or "",
            element_type="hyperlink",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_hyperlink_extractor"
127
+
128
+
129
class PptxImageExtractor(BasePptxExtractor):
    """Extracts image information from shapes."""

    @staticmethod
    def _image_of(shape: BaseShape) -> Any:
        """Return the shape's image object, or ``None`` if it has none.

        Args:
            shape: Shape to inspect.

        Returns:
            ``shape.image`` when the attribute exists and can be read,
            otherwise ``None``.
        """
        try:
            return shape.image
        except (AttributeError, ValueError):
            # Shapes without an ``image`` attribute raise AttributeError.
            # NOTE(review): an empty picture placeholder is believed to raise
            # ValueError in python-pptx - confirm against the library.
            return None

    def _describe_image(self, shape: BaseShape) -> str:
        """Build the element text for a shape's image.

        Falls back to ``embedded_image`` when the image has no usable
        ``filename`` (missing attribute, ``None`` or empty string).
        """
        image = self._image_of(shape)
        filename = getattr(image, "filename", None) or "embedded_image"
        return f"Image: {filename}"

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Extract image content from the presentation or a specific slide.

        Args:
            presentation: Presentation to read.
            document_meta: Metadata attached to every created element.
            slide: Optional slide to restrict extraction to.

        Returns:
            One ``"image"`` element per shape that carries an image, with
            content of the form ``Image: <filename>``.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            # Probe the attribute safely: most shapes are not pictures, and an
            # unguarded ``shape.image`` would raise for every one of them.
            shape_filter=lambda shape: self._image_of(shape) is not None,
            content_extractor=self._describe_image,
            element_type="image",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_image_extractor"
146
+
147
+
148
class PptxShapeExtractor(BasePptxExtractor):
    """Extracts shape information and metadata."""

    @staticmethod
    def _shape_type_of(shape: BaseShape) -> Any:
        """Return the shape's ``shape_type``, or ``None`` if it is unavailable.

        Args:
            shape: Shape to inspect.

        Returns:
            The value of ``shape.shape_type``, or ``None`` when the attribute
            is missing or reading it raises ``NotImplementedError``.
        """
        try:
            return shape.shape_type
        except (AttributeError, NotImplementedError):
            # ``hasattr`` only suppresses AttributeError. NOTE(review):
            # python-pptx is believed to raise NotImplementedError for shapes
            # whose type it cannot classify - confirm against the library.
            return None

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Extract shape metadata from the presentation or a specific slide.

        Args:
            presentation: Presentation to read.
            document_meta: Metadata attached to every created element.
            slide: Optional slide to restrict extraction to.

        Returns:
            One ``"shape"`` element per shape with a known shape type, with
            content of the form ``Shape: <shape_type>``. Shapes whose type is
            unavailable or ``None`` are skipped.
        """
        return self._extract_from_shapes(
            presentation=presentation,
            document_meta=document_meta,
            slide=slide,
            shape_filter=lambda shape: self._shape_type_of(shape) is not None,
            content_extractor=lambda shape: f"Shape: {self._shape_type_of(shape)}",
            element_type="shape",
        )

    def get_extractor_name(self) -> str:
        """Get the name of this extractor."""
        return "pptx_shape_extractor"
165
+
166
+
167
class PptxMetadataExtractor(BasePptxExtractor):
    """Extractor that exposes the presentation's core properties as elements."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Build one ``"metadata"`` element per non-blank core property.

        Args:
            presentation: Presentation whose ``core_properties`` are read.
            document_meta: Metadata attached to every created element.
            slide: Accepted for interface compatibility; not used here.

        Returns:
            Elements with content ``<name>: <value>`` and page number 0, in the
            order author, title, subject, keywords, category, created, modified.
        """
        props = presentation.core_properties

        # Insertion order of the dict fixes the order of the emitted elements.
        named_values = {
            "author": props.author,
            "title": props.title,
            "subject": props.subject,
            "keywords": props.keywords,
            "category": props.category,
            "created": str(props.created) if props.created else None,
            "modified": str(props.modified) if props.modified else None,
        }

        return [
            self._create_text_element(
                element_type="metadata",
                document_meta=document_meta,
                content=f"{label}: {value}",
                slide_idx=0,
            )
            for label, value in named_values.items()
            if value is not None and str(value).strip()
        ]

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_metadata_extractor"
199
+
200
+
201
class PptxSpeakerNotesExtractor(BasePptxExtractor):
    """Extractor that turns slides' speaker notes into elements."""

    def extract(
        self,
        presentation: Presentation,
        document_meta: DocumentMeta,
        slide: Slide | None = None,
    ) -> list[TextElement]:
        """Collect speaker notes from the presentation or a single slide.

        Args:
            presentation: Presentation to read.
            document_meta: Metadata attached to every created element.
            slide: Optional slide to restrict extraction to.

        Returns:
            One ``"speaker_notes"`` element per slide that has a notes slide
            with a notes text frame holding non-blank text. The element's
            coordinates are the text frame's four margins.
        """
        collected: list[TextElement] = []

        for slide_idx, sld in self._get_slides(presentation, slide):
            # Check ``has_notes_slide`` first so ``notes_slide`` is only read
            # for slides that already have one.
            if not sld.has_notes_slide:
                continue

            frame = sld.notes_slide.notes_text_frame
            if frame is None:
                continue

            note_text = frame.text.strip()
            if not note_text:
                continue

            margins = {
                "left": frame.margin_left,
                "right": frame.margin_right,
                "top": frame.margin_top,
                "bottom": frame.margin_bottom,
            }
            collected.append(
                self._create_text_element(
                    element_type="speaker_notes",
                    document_meta=document_meta,
                    content=note_text,
                    slide_idx=slide_idx,
                    coordinates=margins,
                )
            )

        return collected

    def get_extractor_name(self) -> str:
        """Return the identifier of this extractor."""
        return "pptx_speaker_notes_extractor"
236
+
237
+
238
# Default set of extractor instances: one instance per extractor class, in the
# order listed below.
DEFAULT_EXTRACTORS: list[BasePptxExtractor] = [
    extractor_cls()
    for extractor_cls in (
        PptxTextExtractor,
        PptxHyperlinkExtractor,
        PptxImageExtractor,
        PptxShapeExtractor,
        PptxSpeakerNotesExtractor,
        PptxMetadataExtractor,
    )
]
0 commit comments