9
9
from pptx .presentation import Presentation
10
10
from pptx .slide import Slide
11
11
12
- from ragbits .document_search .documents .element import Element
12
+ from ragbits .document_search .documents .element import Element , ImageElement , TextElement
13
13
14
- from .dataclasses import (
15
- ExtractedHyperlink ,
16
- ExtractedImage ,
17
- ExtractedMetadata ,
18
- ExtractedShape ,
19
- ExtractedSlideImage ,
20
- ExtractedSpeakerNotes ,
21
- ExtractedText ,
22
- )
23
14
24
-
25
- class BaseExtractor (ABC ):
15
+ class BasePptxExtractor (ABC ):
26
16
"""Base class for all PPTX content extractors."""
27
17
28
18
@abstractmethod
@@ -36,10 +26,10 @@ def get_extractor_name(self) -> str:
36
26
pass
37
27
38
28
39
- class TextExtractor ( BaseExtractor ):
29
+ class PptxTextExtractor ( BasePptxExtractor ):
40
30
"""Extracts text content with hierarchy, positioning, and formatting."""
41
31
42
- def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [ExtractedText ]:
32
+ def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [Element ]:
43
33
"""Extract text content from all slides or a specific slide."""
44
34
extracted_texts = []
45
35
@@ -57,7 +47,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
57
47
for run in paragraph .runs :
58
48
if run .text .strip ():
59
49
extracted_texts .append (
60
- ExtractedText (
50
+ TextElement (
61
51
content = run .text ,
62
52
slide_index = slide_idx ,
63
53
shape_id = shape .shape_id ,
@@ -122,10 +112,10 @@ def get_extractor_name(self) -> str:
122
112
return "text"
123
113
124
114
125
- class HyperlinkExtractor ( BaseExtractor ):
115
+ class PptxHyperlinkExtractor ( BasePptxExtractor ):
126
116
"""Extracts hyperlinks from shapes and text runs."""
127
117
128
- def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [ExtractedHyperlink ]:
118
+ def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [Element ]:
129
119
"""Extract hyperlinks from all slides or a specific slide."""
130
120
extracted_hyperlinks = []
131
121
@@ -142,7 +132,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
142
132
hyperlink_info = self ._extract_hyperlink_from_action (shape .click_action )
143
133
if hyperlink_info :
144
134
extracted_hyperlinks .append (
145
- ExtractedHyperlink (
135
+ Element (
146
136
url = hyperlink_info ["url" ],
147
137
display_text = hyperlink_info ["display_text" ],
148
138
slide_index = slide_idx ,
@@ -159,7 +149,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
159
149
for run in paragraph .runs :
160
150
if hasattr (run , "hyperlink" ) and run .hyperlink and run .hyperlink .address :
161
151
extracted_hyperlinks .append (
162
- ExtractedHyperlink (
152
+ Element (
163
153
url = run .hyperlink .address ,
164
154
display_text = run .text ,
165
155
slide_index = slide_idx ,
@@ -199,10 +189,10 @@ def get_extractor_name(self) -> str:
199
189
return "hyperlink"
200
190
201
191
202
- class ImageExtractor ( BaseExtractor ):
192
+ class PptxImageExtractor ( BasePptxExtractor ):
203
193
"""Extracts embedded images from slides."""
204
194
205
- def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [ExtractedImage ]:
195
+ def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [Element ]:
206
196
"""Extract images from all slides or a specific slide."""
207
197
extracted_images = []
208
198
@@ -220,7 +210,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
220
210
image_format = self ._get_image_format (image_bytes )
221
211
222
212
extracted_images .append (
223
- ExtractedImage (
213
+ ImageElement (
224
214
image_bytes = image_bytes ,
225
215
slide_index = slide_idx ,
226
216
shape_id = shape .shape_id ,
@@ -259,10 +249,10 @@ def get_extractor_name(self) -> str:
259
249
return "image"
260
250
261
251
262
- class ShapeExtractor ( BaseExtractor ):
252
+ class PptxShapeExtractor ( BasePptxExtractor ):
263
253
"""Extracts shape information including positioning and styling."""
264
254
265
- def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [ExtractedShape ]:
255
+ def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [Element ]:
266
256
"""Extract shapes from all slides or a specific slide."""
267
257
extracted_shapes = []
268
258
@@ -280,7 +270,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
280
270
)
281
271
282
272
extracted_shapes .append (
283
- ExtractedShape (
273
+ Element (
284
274
shape_type = self ._get_shape_type_name (shape .shape_type ),
285
275
slide_index = slide_idx ,
286
276
shape_id = shape .shape_id ,
@@ -338,10 +328,10 @@ def get_extractor_name(self) -> str:
338
328
return "shape"
339
329
340
330
341
- class SpeakerNotesExtractor ( BaseExtractor ):
331
+ class PptxSpeakerNotesExtractor ( BasePptxExtractor ):
342
332
"""Extracts speaker notes from slides."""
343
333
344
- def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [ExtractedSpeakerNotes ]:
334
+ def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [Element ]:
345
335
"""Extract notes from all slides or a specific slide."""
346
336
extracted_notes = []
347
337
@@ -365,7 +355,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
365
355
if notes_text .strip ():
366
356
formatting = self ._extract_notes_formatting (notes_slide .notes_text_frame )
367
357
extracted_notes .append (
368
- ExtractedSpeakerNotes (content = notes_text , slide_index = slide_idx , formatting = formatting )
358
+ Element (content = notes_text , slide_index = slide_idx , formatting = formatting )
369
359
)
370
360
371
361
return extracted_notes
@@ -392,10 +382,10 @@ def get_extractor_name(self) -> str:
392
382
return "notes"
393
383
394
384
395
- class SlideImageExtractor ( BaseExtractor ):
385
+ class PptxSlideImageExtractor ( BasePptxExtractor ):
396
386
"""Extracts each slide as an image."""
397
387
398
- def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [ExtractedSlideImage ]:
388
+ def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [Element ]:
399
389
"""Extract slides as images - placeholder implementation."""
400
390
# Note: This would require additional libraries like python-pptx-interface
401
391
# or conversion tools to render slides as images
@@ -406,14 +396,14 @@ def get_extractor_name(self) -> str:
406
396
return "slide_image"
407
397
408
398
409
- class MetadataExtractor ( BaseExtractor ):
399
+ class PptxMetadataExtractor ( BasePptxExtractor ):
410
400
"""Extracts document metadata and properties."""
411
401
412
- def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [ExtractedMetadata ]:
402
+ def extract (self , presentation : Presentation , slide : Slide | None = None ) -> list [Element ]:
413
403
"""Extract metadata from the presentation."""
414
404
core_props = presentation .core_properties
415
405
416
- metadata = ExtractedMetadata (
406
+ metadata = Element (
417
407
title = core_props .title ,
418
408
author = core_props .author ,
419
409
subject = core_props .subject ,
@@ -446,13 +436,12 @@ def get_extractor_name(self) -> str:
446
436
return "metadata"
447
437
448
438
449
- # Default list of extractors
450
439
DEFAULT_EXTRACTORS = [
451
- TextExtractor (),
452
- HyperlinkExtractor (),
453
- ImageExtractor (),
454
- ShapeExtractor (),
455
- SpeakerNotesExtractor (),
456
- MetadataExtractor (),
457
- SlideImageExtractor (),
440
+ PptxTextExtractor (),
441
+ PptxHyperlinkExtractor (),
442
+ PptxImageExtractor (),
443
+ PptxShapeExtractor (),
444
+ PptxSpeakerNotesExtractor (),
445
+ PptxMetadataExtractor (),
446
+ PptxSlideImageExtractor (),
458
447
]
0 commit comments