Skip to content

Commit d99ed3a

Browse files
committed
feat(pptx): enhance PPTX parser to extract metadata, slide previews, and non-text shapes
- Added extraction of document-level metadata (author, creation and modification dates).
- Implemented slide title and subtitle descriptions.
- Included optional rendering of slide previews as PNG images.
- Enhanced handling of non-text shapes with descriptive text.
1 parent 52f42c7 commit d99ed3a

File tree

1 file changed

+113
-18
lines changed
  • packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers

1 file changed

+113
-18
lines changed
Lines changed: 113 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,64 @@
11
from __future__ import annotations
22

3+
from collections.abc import Iterable
4+
from pathlib import Path
5+
from tempfile import NamedTemporaryFile
6+
from typing import Any, Final, List
7+
8+
import aspose.pydrawing as drawing
9+
from aspose import slides
310
from pptx import Presentation
4-
from pptx.enum.shapes import MSO_SHAPE_TYPE
11+
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
512

613
from ragbits.document_search.documents.document import Document, DocumentType
7-
from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
14+
from ragbits.document_search.documents.element import (
15+
Element,
16+
ElementLocation,
17+
ImageElement,
18+
TextElement,
19+
)
820
from ragbits.document_search.ingestion.parsers.base import DocumentParser
921

1022

1123
class PptxDocumentParser(DocumentParser):
12-
"""Parser that extracts content from PPTX files using *python-pptx*.
24+
"""Parser that extracts rich content from PPTX files.
1325
14-
The parser retrieves text from all textual shapes, table cells and slide notes, as well as
15-
the binary bytes of pictures embedded in the presentation. Each piece of data is converted
16-
into a corresponding :class:`~ragbits.document_search.documents.element.TextElement` or
17-
:class:`~ragbits.document_search.documents.element.ImageElement`.
26+
Besides textual shapes, tables and speaker notes the parser also:
27+
* extracts embedded pictures, retaining *alt-text* as the description,
28+
* extracts generic non-text shapes as textual descriptions,
29+
* captures slide-level title / subtitle as a description element,
30+
* attaches document-level metadata (author, creation & modification dates),
31+
* optionally renders a PNG preview of each slide when a rendering backend
32+
is available (``aspose.slides`` preferred).
1833
"""
1934

20-
supported_document_types = {DocumentType.PPTX}
21-
22-
async def parse(self, document: Document) -> list[Element]:
23-
"""Parse the given PPTX document.
24-
25-
Args:
26-
document: The document to parse.
35+
supported_document_types: Final[set[DocumentType]] = {DocumentType.PPTX}
2736

28-
Returns:
29-
A list of extracted elements.
30-
"""
37+
async def parse(self, document: Document) -> List[Element]: # noqa: D401
38+
"""Parse the given PPTX document and return extracted elements."""
3139
self.validate_document_type(document.metadata.document_type)
40+
3241
presentation = Presentation(str(document.local_path))
33-
elements: list[Element] = []
42+
elements: List[Element] = []
43+
44+
# document-level metadata
45+
core_props = presentation.core_properties
46+
author = core_props.author or core_props.last_modified_by or "Unknown"
47+
created = core_props.created.isoformat() if core_props.created else "Unknown"
48+
modified = core_props.modified.isoformat() if core_props.modified else "Unknown"
49+
meta_text = f"Author: {author}\nCreated: {created}\nModified: {modified}"
50+
elements.append(
51+
TextElement(
52+
document_meta=document.metadata,
53+
location=None,
54+
content=meta_text,
55+
)
56+
)
3457

3558
for slide_idx, slide in enumerate(presentation.slides, start=1):
3659
slide_location = ElementLocation(page_number=slide_idx)
3760

61+
# textual shapes & table cells
3862
for shape in slide.shapes:
3963
if shape.has_text_frame:
4064
text = shape.text
@@ -46,6 +70,9 @@ async def parse(self, document: Document) -> list[Element]:
4670
content=text.strip(),
4771
)
4872
)
73+
continue
74+
75+
# table
4976
if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
5077
for row in shape.table.rows:
5178
for cell in row.cells:
@@ -58,6 +85,9 @@ async def parse(self, document: Document) -> list[Element]:
5885
content=cell_text.strip(),
5986
)
6087
)
88+
continue
89+
90+
# picture
6191
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
6292
image_bytes = shape.image.blob
6393
description = getattr(shape, "alt_text", None) or None
@@ -69,7 +99,19 @@ async def parse(self, document: Document) -> list[Element]:
6999
description=description,
70100
)
71101
)
102+
continue
103+
104+
# generic non-text shape description
105+
shape_desc = f"Shape type: {shape.shape_type.name}, name: {shape.name}"
106+
elements.append(
107+
TextElement(
108+
document_meta=document.metadata,
109+
location=slide_location,
110+
content=shape_desc,
111+
)
112+
)
72113

114+
# speaker notes
73115
if slide.has_notes_slide and slide.notes_slide.notes_text_frame is not None:
74116
notes_text = slide.notes_slide.notes_text_frame.text
75117
if notes_text and notes_text.strip():
@@ -81,4 +123,57 @@ async def parse(self, document: Document) -> list[Element]:
81123
)
82124
)
83125

126+
# slide title / subtitle description
127+
title_text = slide.shapes.title.text if slide.shapes.title else ""
128+
subtitle_text = _extract_subtitle(slide.shapes)
129+
if title_text or subtitle_text:
130+
desc = f"Slide description: {title_text} {subtitle_text}".strip()
131+
elements.append(
132+
TextElement(
133+
document_meta=document.metadata,
134+
location=slide_location,
135+
content=desc,
136+
)
137+
)
138+
139+
# full slide preview (optional)
140+
preview_bytes = _render_slide_preview(document.local_path, slide_idx)
141+
if preview_bytes is not None:
142+
elements.append(
143+
ImageElement(
144+
document_meta=document.metadata,
145+
location=slide_location,
146+
image_bytes=preview_bytes,
147+
description=f"Slide {slide_idx} preview",
148+
)
149+
)
150+
84151
return elements
152+
153+
154+
def _extract_subtitle(shapes: Iterable[Any]) -> str:
    """Return the text of the first subtitle placeholder found in *shapes*.

    Shapes are scanned in iteration order. A shape qualifies when it is a
    placeholder, its placeholder type equals ``PP_PLACEHOLDER.SUBTITLE`` and
    it has a text frame. The text is returned unmodified (no stripping).

    Args:
        shapes: Iterable of shape objects exposing ``is_placeholder``,
            ``placeholder_format``, ``has_text_frame`` and ``text``.

    Returns:
        The qualifying shape's text, or an empty string when none qualifies.
    """
    for candidate in shapes:
        # Checked first so ``placeholder_format`` is only touched on placeholders.
        if not candidate.is_placeholder:  # type: ignore[attr-defined]
            continue
        kind = candidate.placeholder_format.type  # type: ignore[attr-defined]
        if not (kind == PP_PLACEHOLDER.SUBTITLE):
            continue
        if candidate.has_text_frame:
            return candidate.text
    return ""
165+
166+
167+
def _render_slide_preview(pptx_path: Path, slide_idx: int) -> bytes | None:
    """Return a PNG rendering of slide *slide_idx* (1-based), or ``None`` if out of range.

    The presentation at *pptx_path* is opened with ``aspose.slides``, a
    thumbnail of the requested slide is saved to a temporary ``.png`` file,
    and that file's bytes are returned. The temporary file is always removed,
    even when saving or reading fails.

    Args:
        pptx_path: Path of the PPTX file to render.
        slide_idx: 1-based index of the slide to render.

    Returns:
        The PNG bytes, or ``None`` when *slide_idx* is smaller than 1 or
        larger than the number of slides.
    """
    with slides.Presentation(str(pptx_path)) as pres:
        # Check both bounds: an upper-bound-only test would let 0 or negative
        # values reach the slide indexer.
        if not 1 <= slide_idx <= pres.slides.length:
            return None

        # NOTE(review): the two 2.0 arguments are presumably the x/y scale
        # factors of the thumbnail; confirm against the aspose.slides docs.
        image = pres.slides[slide_idx - 1].get_thumbnail(2.0, 2.0)

        # Reserve a unique path, then close our handle before the renderer
        # writes to it by name; a still-open handle blocks re-opening and
        # deleting the file on Windows.
        with NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            image.save(str(tmp_path), drawing.imaging.ImageFormat.png)
            return tmp_path.read_bytes()
        finally:
            # delete=False means cleanup is our job; do it on every exit path.
            tmp_path.unlink(missing_ok=True)

0 commit comments

Comments
 (0)