fix(pptx): improve image and speaker notes extraction logic

maxpill · maxpill · commit e0e753775015 · 2025-07-11T16:40:56.000+02:00
- Updated shape filter in PptxImageExtractor to drop the `hasattr(shape, 'image')` guard and test `shape.image` directly for image presence.
- Enhanced PptxSpeakerNotesExtractor to handle cases where notes_text_frame may be None, ensuring robust extraction.
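- Removed commented-out code for clarity and maintainability; also removed the per-shape text loop over `notes_slide.shapes` in the speaker notes extractor and dropped `PptxSlideImageExtractor()` from the extractor list.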
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors/extractors.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors/extractors.py
@@ -135,7 +135,7 @@ def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide
             presentation=presentation,
             document_meta=document_meta,
             slide=slide,
-            shape_filter=lambda shape: hasattr(shape, 'image') and shape.image is not None,
+            shape_filter=lambda shape: shape.image and shape.image is not None,
             content_extractor=lambda shape: f"Image: {shape.image.filename if hasattr(shape.image, 'filename') else 'embedded_image'}",
             element_type="image"
         )
@@ -209,9 +209,9 @@ def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide
             if sld.has_notes_slide and sld.notes_slide.notes_text_frame is not None:
                 notes_slide = sld.notes_slide
                 notes_text_frame = notes_slide.notes_text_frame
-                text = notes_text_frame.text.strip()
+                text = notes_text_frame.text.strip() if notes_text_frame is not None else None
                 
-                if text:
+                if text and notes_text_frame is not None:
                     coordinates = {
                         "left": notes_text_frame.margin_left,
                         "right": notes_text_frame.margin_right,
@@ -227,19 +227,6 @@ def extract(self, presentation: Presentation, document_meta: DocumentMeta, slide
                         coordinates=coordinates
                     )
                     elements.append(element)
-                
-                for shape in notes_slide.shapes:
-                    if shape.has_text_frame:
-                        text = str(shape.text_frame.text).strip()
-                        if text:
-                            element = self._create_text_element(
-                                element_type="speaker_notes",
-                                document_meta=document_meta,
-                                content=text,
-                                slide_idx=slide_idx,
-                                shape=shape
-                            )
-                            elements.append(element)
 
         return elements
 
@@ -255,5 +242,4 @@ def get_extractor_name(self) -> str:
     PptxShapeExtractor(),
     PptxSpeakerNotesExtractor(),
     PptxMetadataExtractor(),
-    PptxSlideImageExtractor(),
 ]