feat(pptx): enhance PPTX parser to extract metadata, slide previews, and non-text shapes

maxpill · maxpill · commit d99ed3aa681c · 2025-07-11T10:42:42.000+02:00
- Added extraction of document-level metadata (author, creation date, and modification date).
- Added a per-slide description element built from the slide's title and subtitle.
- Included optional rendering of slide previews as PNG images.
- Added a descriptive text element (shape type and name) for each non-text shape.
diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx.py
@@ -1,40 +1,64 @@
 from __future__ import annotations
 
+from collections.abc import Iterable
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import Any, Final, List
+
+import aspose.pydrawing as drawing
+from aspose import slides
 from pptx import Presentation
-from pptx.enum.shapes import MSO_SHAPE_TYPE
+from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
 
 from ragbits.document_search.documents.document import Document, DocumentType
-from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
+from ragbits.document_search.documents.element import (
+    Element,
+    ElementLocation,
+    ImageElement,
+    TextElement,
+)
 from ragbits.document_search.ingestion.parsers.base import DocumentParser
 
 
 class PptxDocumentParser(DocumentParser):
-    """Parser that extracts content from PPTX files using *python-pptx*.
+    """Parser that extracts rich content from PPTX files.
 
-    The parser retrieves text from all textual shapes, table cells and slide notes, as well as
-    the binary bytes of pictures embedded in the presentation. Each piece of data is converted
-    into a corresponding :class:`~ragbits.document_search.documents.element.TextElement` or
-    :class:`~ragbits.document_search.documents.element.ImageElement`.
+    Besides textual shapes, tables and speaker notes the parser also:
+    * extracts embedded pictures, retaining *alt-text* as the description,
+    * extracts generic non-text shapes as textual descriptions,
+    * captures slide-level title / subtitle as a description element,
+    * attaches document-level metadata (author, creation & modification dates),
+    * optionally renders a PNG preview of each slide when a rendering backend
+      is available (``aspose.slides`` preferred).
     """
 
-    supported_document_types = {DocumentType.PPTX}
-
-    async def parse(self, document: Document) -> list[Element]:
-        """Parse the given PPTX document.
-
-        Args:
-            document: The document to parse.
+    supported_document_types: Final[set[DocumentType]] = {DocumentType.PPTX}
 
-        Returns:
-            A list of extracted elements.
-        """
+    async def parse(self, document: Document) -> List[Element]:  # noqa: D401
+        """Parse the given PPTX document and return extracted elements."""
         self.validate_document_type(document.metadata.document_type)
+
         presentation = Presentation(str(document.local_path))
-        elements: list[Element] = []
+        elements: List[Element] = []
+
+        # document-level metadata
+        core_props = presentation.core_properties
+        author = core_props.author or core_props.last_modified_by or "Unknown"
+        created = core_props.created.isoformat() if core_props.created else "Unknown"
+        modified = core_props.modified.isoformat() if core_props.modified else "Unknown"
+        meta_text = f"Author: {author}\nCreated: {created}\nModified: {modified}"
+        elements.append(
+            TextElement(
+                document_meta=document.metadata,
+                location=None,
+                content=meta_text,
+            )
+        )
 
         for slide_idx, slide in enumerate(presentation.slides, start=1):
             slide_location = ElementLocation(page_number=slide_idx)
 
+            # textual shapes & table cells
             for shape in slide.shapes:
                 if shape.has_text_frame:
                     text = shape.text
@@ -46,6 +70,9 @@ async def parse(self, document: Document) -> list[Element]:
                                 content=text.strip(),
                             )
                         )
+                    continue
+
+                # table
                 if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                     for row in shape.table.rows:
                         for cell in row.cells:
@@ -58,6 +85,9 @@ async def parse(self, document: Document) -> list[Element]:
                                         content=cell_text.strip(),
                                     )
                                 )
+                    continue
+
+                # picture
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                     image_bytes = shape.image.blob
                     description = getattr(shape, "alt_text", None) or None
@@ -69,7 +99,19 @@ async def parse(self, document: Document) -> list[Element]:
                             description=description,
                         )
                     )
+                    continue
+
+                # generic non-text shape description
+                shape_desc = f"Shape type: {shape.shape_type.name}, name: {shape.name}"
+                elements.append(
+                    TextElement(
+                        document_meta=document.metadata,
+                        location=slide_location,
+                        content=shape_desc,
+                    )
+                )
 
+            # speaker notes
             if slide.has_notes_slide and slide.notes_slide.notes_text_frame is not None:
                 notes_text = slide.notes_slide.notes_text_frame.text
                 if notes_text and notes_text.strip():
@@ -81,4 +123,57 @@ async def parse(self, document: Document) -> list[Element]:
                         )
                     )
 
+            # slide title / subtitle description
+            title_text = slide.shapes.title.text if slide.shapes.title else ""
+            subtitle_text = _extract_subtitle(slide.shapes)
+            if title_text or subtitle_text:
+                desc = f"Slide description: {title_text} {subtitle_text}".strip()
+                elements.append(
+                    TextElement(
+                        document_meta=document.metadata,
+                        location=slide_location,
+                        content=desc,
+                    )
+                )
+
+            # full slide preview (optional)
+            preview_bytes = _render_slide_preview(document.local_path, slide_idx)
+            if preview_bytes is not None:
+                elements.append(
+                    ImageElement(
+                        document_meta=document.metadata,
+                        location=slide_location,
+                        image_bytes=preview_bytes,
+                        description=f"Slide {slide_idx} preview",
+                    )
+                )
+
         return elements
+
+
+def _extract_subtitle(shapes: Iterable[Any]) -> str:
+    """Return the text of the first subtitle placeholder in *shapes*, or an empty string if none."""
+    for candidate in shapes:
+        if not candidate.is_placeholder:  # type: ignore[attr-defined]
+            continue
+        kind = candidate.placeholder_format.type  # type: ignore[attr-defined]
+        if kind != PP_PLACEHOLDER.SUBTITLE:
+            continue
+        if candidate.has_text_frame:
+            return candidate.text
+    return ""
+
+
+def _render_slide_preview(pptx_path: Path, slide_idx: int) -> bytes | None:
+    """Render 1-based *slide_idx* of *pptx_path* to PNG bytes; return *None* if the index is out of range."""
+    with slides.Presentation(str(pptx_path)) as pres:
+        if not 1 <= slide_idx <= pres.slides.length:  # also rejects idx < 1 rather than indexing negatively
+            return None
+        image = pres.slides[slide_idx - 1].get_thumbnail(2.0, 2.0)  # thumbnail at scale 2.0 on both axes
+        with NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+            tmp_path = Path(tmp.name)  # only reserve a name; the handle is closed before aspose writes
+        try:
+            image.save(str(tmp_path), drawing.imaging.ImageFormat.png)
+            return tmp_path.read_bytes()
+        finally:
+            tmp_path.unlink(missing_ok=True)  # clean up even when save/read raises