Skip to content

Commit 888eadf

Browse files
committed
refactor(pptx): enhance PPTX extraction classes and remove unused dataclasses
- Renamed and refactored extractor classes to follow a consistent naming convention, changing BaseExtractor to BasePptxExtractor and updating derived classes accordingly.
- Removed the dataclasses.py file as it contained unused data structures.
- Updated the DEFAULT_EXTRACTORS list to include the new extractor classes.
- Added python-pptx as a dependency in uv.lock and updated the requirements.
1 parent d394c80 commit 888eadf

File tree

4 files changed

+43
-162
lines changed

4 files changed

+43
-162
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .parser import PptxDocumentParser
2+
3+
__all__ = [
4+
"PptxDocumentParser",
5+
]
Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .extractors import (
22
DEFAULT_EXTRACTORS,
3-
BaseExtractor,
3+
BasePptxExtractor,
44
HyperlinkExtractor,
55
ImageExtractor,
66
MetadataExtractor,
@@ -11,11 +11,11 @@
1111

1212
__all__ = [
1313
"DEFAULT_EXTRACTORS",
14-
"BaseExtractor",
15-
"HyperlinkExtractor",
16-
"ImageExtractor",
17-
"MetadataExtractor",
18-
"ShapeExtractor",
19-
"SpeakerNotesExtractor",
20-
"TextExtractor",
14+
"BasePptxExtractor",
15+
"PptxHyperlinkExtractor",
16+
"PptxImageExtractor",
17+
"PptxMetadataExtractor",
18+
"PptxShapeExtractor",
19+
"PptxSpeakerNotesExtractor",
20+
"PptxTextExtractor",
2121
]

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors/dataclasses.py

Lines changed: 0 additions & 113 deletions
This file was deleted.

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors/extractors.py

Lines changed: 30 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,10 @@
99
from pptx.presentation import Presentation
1010
from pptx.slide import Slide
1111

12-
from ragbits.document_search.documents.element import Element
12+
from ragbits.document_search.documents.element import Element, ImageElement, TextElement
1313

14-
from .dataclasses import (
15-
ExtractedHyperlink,
16-
ExtractedImage,
17-
ExtractedMetadata,
18-
ExtractedShape,
19-
ExtractedSlideImage,
20-
ExtractedSpeakerNotes,
21-
ExtractedText,
22-
)
2314

24-
25-
class BaseExtractor(ABC):
15+
class BasePptxExtractor(ABC):
2616
"""Base class for all PPTX content extractors."""
2717

2818
@abstractmethod
@@ -36,10 +26,10 @@ def get_extractor_name(self) -> str:
3626
pass
3727

3828

39-
class TextExtractor(BaseExtractor):
29+
class PptxTextExtractor(BasePptxExtractor):
4030
"""Extracts text content with hierarchy, positioning, and formatting."""
4131

42-
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[ExtractedText]:
32+
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[Element]:
4333
"""Extract text content from all slides or a specific slide."""
4434
extracted_texts = []
4535

@@ -57,7 +47,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
5747
for run in paragraph.runs:
5848
if run.text.strip():
5949
extracted_texts.append(
60-
ExtractedText(
50+
TextElement(
6151
content=run.text,
6252
slide_index=slide_idx,
6353
shape_id=shape.shape_id,
@@ -122,10 +112,10 @@ def get_extractor_name(self) -> str:
122112
return "text"
123113

124114

125-
class HyperlinkExtractor(BaseExtractor):
115+
class PptxHyperlinkExtractor(BasePptxExtractor):
126116
"""Extracts hyperlinks from shapes and text runs."""
127117

128-
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[ExtractedHyperlink]:
118+
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[Element]:
129119
"""Extract hyperlinks from all slides or a specific slide."""
130120
extracted_hyperlinks = []
131121

@@ -142,7 +132,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
142132
hyperlink_info = self._extract_hyperlink_from_action(shape.click_action)
143133
if hyperlink_info:
144134
extracted_hyperlinks.append(
145-
ExtractedHyperlink(
135+
Element(
146136
url=hyperlink_info["url"],
147137
display_text=hyperlink_info["display_text"],
148138
slide_index=slide_idx,
@@ -159,7 +149,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
159149
for run in paragraph.runs:
160150
if hasattr(run, "hyperlink") and run.hyperlink and run.hyperlink.address:
161151
extracted_hyperlinks.append(
162-
ExtractedHyperlink(
152+
Element(
163153
url=run.hyperlink.address,
164154
display_text=run.text,
165155
slide_index=slide_idx,
@@ -199,10 +189,10 @@ def get_extractor_name(self) -> str:
199189
return "hyperlink"
200190

201191

202-
class ImageExtractor(BaseExtractor):
192+
class PptxImageExtractor(BasePptxExtractor):
203193
"""Extracts embedded images from slides."""
204194

205-
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[ExtractedImage]:
195+
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[Element]:
206196
"""Extract images from all slides or a specific slide."""
207197
extracted_images = []
208198

@@ -220,7 +210,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
220210
image_format = self._get_image_format(image_bytes)
221211

222212
extracted_images.append(
223-
ExtractedImage(
213+
ImageElement(
224214
image_bytes=image_bytes,
225215
slide_index=slide_idx,
226216
shape_id=shape.shape_id,
@@ -259,10 +249,10 @@ def get_extractor_name(self) -> str:
259249
return "image"
260250

261251

262-
class ShapeExtractor(BaseExtractor):
252+
class PptxShapeExtractor(BasePptxExtractor):
263253
"""Extracts shape information including positioning and styling."""
264254

265-
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[ExtractedShape]:
255+
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[Element]:
266256
"""Extract shapes from all slides or a specific slide."""
267257
extracted_shapes = []
268258

@@ -280,7 +270,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
280270
)
281271

282272
extracted_shapes.append(
283-
ExtractedShape(
273+
Element(
284274
shape_type=self._get_shape_type_name(shape.shape_type),
285275
slide_index=slide_idx,
286276
shape_id=shape.shape_id,
@@ -338,10 +328,10 @@ def get_extractor_name(self) -> str:
338328
return "shape"
339329

340330

341-
class SpeakerNotesExtractor(BaseExtractor):
331+
class PptxSpeakerNotesExtractor(BasePptxExtractor):
342332
"""Extracts speaker notes from slides."""
343333

344-
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[ExtractedSpeakerNotes]:
334+
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[Element]:
345335
"""Extract notes from all slides or a specific slide."""
346336
extracted_notes = []
347337

@@ -365,7 +355,7 @@ def extract(self, presentation: Presentation, slide: Slide | None = None) -> lis
365355
if notes_text.strip():
366356
formatting = self._extract_notes_formatting(notes_slide.notes_text_frame)
367357
extracted_notes.append(
368-
ExtractedSpeakerNotes(content=notes_text, slide_index=slide_idx, formatting=formatting)
358+
Element(content=notes_text, slide_index=slide_idx, formatting=formatting)
369359
)
370360

371361
return extracted_notes
@@ -392,10 +382,10 @@ def get_extractor_name(self) -> str:
392382
return "notes"
393383

394384

395-
class SlideImageExtractor(BaseExtractor):
385+
class PptxSlideImageExtractor(BasePptxExtractor):
396386
"""Extracts each slide as an image."""
397387

398-
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[ExtractedSlideImage]:
388+
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[Element]:
399389
"""Extract slides as images - placeholder implementation."""
400390
# Note: This would require additional libraries like python-pptx-interface
401391
# or conversion tools to render slides as images
@@ -406,14 +396,14 @@ def get_extractor_name(self) -> str:
406396
return "slide_image"
407397

408398

409-
class MetadataExtractor(BaseExtractor):
399+
class PptxMetadataExtractor(BasePptxExtractor):
410400
"""Extracts document metadata and properties."""
411401

412-
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[ExtractedMetadata]:
402+
def extract(self, presentation: Presentation, slide: Slide | None = None) -> list[Element]:
413403
"""Extract metadata from the presentation."""
414404
core_props = presentation.core_properties
415405

416-
metadata = ExtractedMetadata(
406+
metadata = Element(
417407
title=core_props.title,
418408
author=core_props.author,
419409
subject=core_props.subject,
@@ -446,13 +436,12 @@ def get_extractor_name(self) -> str:
446436
return "metadata"
447437

448438

449-
# Default list of extractors
450439
DEFAULT_EXTRACTORS = [
451-
TextExtractor(),
452-
HyperlinkExtractor(),
453-
ImageExtractor(),
454-
ShapeExtractor(),
455-
SpeakerNotesExtractor(),
456-
MetadataExtractor(),
457-
SlideImageExtractor(),
440+
PptxTextExtractor(),
441+
PptxHyperlinkExtractor(),
442+
PptxImageExtractor(),
443+
PptxShapeExtractor(),
444+
PptxSpeakerNotesExtractor(),
445+
PptxMetadataExtractor(),
446+
PptxSlideImageExtractor(),
458447
]

0 commit comments

Comments
 (0)