Skip to content

Commit e0e7537

Browse files
committed
fix(pptx): improve image and speaker notes extraction logic
- Updated shape filter in PptxImageExtractor to simplify condition for image presence.
- Enhanced PptxSpeakerNotesExtractor to handle cases where notes_text_frame may be None, ensuring robust extraction.
- Removed commented-out code for clarity and maintainability.
1 parent: 29e0086 · commit: e0e7537

File tree

1 file changed

+3
-17
lines changed
  • packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors

1 file changed

+3
-17
lines changed

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors/extractors.py

Lines changed: 3 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,7 @@ def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide
135135
presentation=presentation,
136136
document_meta=document_meta,
137137
slide=slide,
138-
shape_filter=lambda shape: hasattr(shape, 'image') and shape.image is not None,
138+
shape_filter=lambda shape: shape.image and shape.image is not None,
139139
content_extractor=lambda shape: f"Image: {shape.image.filename if hasattr(shape.image, 'filename') else 'embedded_image'}",
140140
element_type="image"
141141
)
@@ -209,9 +209,9 @@ def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide
209209
if sld.has_notes_slide and sld.notes_slide.notes_text_frame is not None:
210210
notes_slide = sld.notes_slide
211211
notes_text_frame = notes_slide.notes_text_frame
212-
text = notes_text_frame.text.strip()
212+
text = notes_text_frame.text.strip() if notes_text_frame is not None else None
213213

214-
if text:
214+
if text and notes_text_frame is not None:
215215
coordinates = {
216216
"left": notes_text_frame.margin_left,
217217
"right": notes_text_frame.margin_right,
@@ -227,19 +227,6 @@ def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide
227227
coordinates=coordinates
228228
)
229229
elements.append(element)
230-
231-
for shape in notes_slide.shapes:
232-
if shape.has_text_frame:
233-
text = str(shape.text_frame.text).strip()
234-
if text:
235-
element = self._create_text_element(
236-
element_type="speaker_notes",
237-
document_meta=document_meta,
238-
content=text,
239-
slide_idx=slide_idx,
240-
shape=shape
241-
)
242-
elements.append(element)
243230

244231
return elements
245232

@@ -255,5 +242,4 @@ def get_extractor_name(self) -> str:
255242
PptxShapeExtractor(),
256243
PptxSpeakerNotesExtractor(),
257244
PptxMetadataExtractor(),
258-
PptxSlideImageExtractor(),
259245
]

0 commit comments

Comments
 (0)