1
1
from __future__ import annotations
2
2
3
+ from collections .abc import Iterable
4
+ from pathlib import Path
5
+ from tempfile import NamedTemporaryFile
6
+ from typing import Any , Final , List
7
+
8
+ import aspose .pydrawing as drawing
9
+ from aspose import slides
3
10
from pptx import Presentation
4
- from pptx .enum .shapes import MSO_SHAPE_TYPE
11
+ from pptx .enum .shapes import MSO_SHAPE_TYPE , PP_PLACEHOLDER
5
12
6
13
from ragbits .document_search .documents .document import Document , DocumentType
7
- from ragbits .document_search .documents .element import Element , ElementLocation , ImageElement , TextElement
14
+ from ragbits .document_search .documents .element import (
15
+ Element ,
16
+ ElementLocation ,
17
+ ImageElement ,
18
+ TextElement ,
19
+ )
8
20
from ragbits .document_search .ingestion .parsers .base import DocumentParser
9
21
10
22
11
23
class PptxDocumentParser (DocumentParser ):
12
- """Parser that extracts content from PPTX files using *python-pptx* .
24
+ """Parser that extracts rich content from PPTX files.
13
25
14
- The parser retrieves text from all textual shapes, table cells and slide notes, as well as
15
- the binary bytes of pictures embedded in the presentation. Each piece of data is converted
16
- into a corresponding :class:`~ragbits.document_search.documents.element.TextElement` or
17
- :class:`~ragbits.document_search.documents.element.ImageElement`.
26
+ Besides textual shapes, tables and speaker notes the parser also:
27
+ * extracts embedded pictures, retaining *alt-text* as the description,
28
+ * extracts generic non-text shapes as textual descriptions,
29
+ * captures slide-level title / subtitle as a description element,
30
+ * attaches document-level metadata (author, creation & modification dates),
31
+ * optionally renders a PNG preview of each slide when a rendering backend
32
+ is available (``aspose.slides`` preferred).
18
33
"""
19
34
20
- supported_document_types = {DocumentType .PPTX }
21
-
22
- async def parse (self , document : Document ) -> list [Element ]:
23
- """Parse the given PPTX document.
24
-
25
- Args:
26
- document: The document to parse.
35
+ supported_document_types : Final [set [DocumentType ]] = {DocumentType .PPTX }
27
36
28
- Returns:
29
- A list of extracted elements.
30
- """
37
+ async def parse (self , document : Document ) -> List [Element ]: # noqa: D401
38
+ """Parse the given PPTX document and return extracted elements."""
31
39
self .validate_document_type (document .metadata .document_type )
40
+
32
41
presentation = Presentation (str (document .local_path ))
33
- elements : list [Element ] = []
42
+ elements : List [Element ] = []
43
+
44
+ # document-level metadata
45
+ core_props = presentation .core_properties
46
+ author = core_props .author or core_props .last_modified_by or "Unknown"
47
+ created = core_props .created .isoformat () if core_props .created else "Unknown"
48
+ modified = core_props .modified .isoformat () if core_props .modified else "Unknown"
49
+ meta_text = f"Author: { author } \n Created: { created } \n Modified: { modified } "
50
+ elements .append (
51
+ TextElement (
52
+ document_meta = document .metadata ,
53
+ location = None ,
54
+ content = meta_text ,
55
+ )
56
+ )
34
57
35
58
for slide_idx , slide in enumerate (presentation .slides , start = 1 ):
36
59
slide_location = ElementLocation (page_number = slide_idx )
37
60
61
+ # textual shapes & table cells
38
62
for shape in slide .shapes :
39
63
if shape .has_text_frame :
40
64
text = shape .text
@@ -46,6 +70,9 @@ async def parse(self, document: Document) -> list[Element]:
46
70
content = text .strip (),
47
71
)
48
72
)
73
+ continue
74
+
75
+ # table
49
76
if shape .shape_type == MSO_SHAPE_TYPE .TABLE :
50
77
for row in shape .table .rows :
51
78
for cell in row .cells :
@@ -58,6 +85,9 @@ async def parse(self, document: Document) -> list[Element]:
58
85
content = cell_text .strip (),
59
86
)
60
87
)
88
+ continue
89
+
90
+ # picture
61
91
if shape .shape_type == MSO_SHAPE_TYPE .PICTURE :
62
92
image_bytes = shape .image .blob
63
93
description = getattr (shape , "alt_text" , None ) or None
@@ -69,7 +99,19 @@ async def parse(self, document: Document) -> list[Element]:
69
99
description = description ,
70
100
)
71
101
)
102
+ continue
103
+
104
+ # generic non-text shape description
105
+ shape_desc = f"Shape type: { shape .shape_type .name } , name: { shape .name } "
106
+ elements .append (
107
+ TextElement (
108
+ document_meta = document .metadata ,
109
+ location = slide_location ,
110
+ content = shape_desc ,
111
+ )
112
+ )
72
113
114
+ # speaker notes
73
115
if slide .has_notes_slide and slide .notes_slide .notes_text_frame is not None :
74
116
notes_text = slide .notes_slide .notes_text_frame .text
75
117
if notes_text and notes_text .strip ():
@@ -81,4 +123,57 @@ async def parse(self, document: Document) -> list[Element]:
81
123
)
82
124
)
83
125
126
+ # slide title / subtitle description
127
+ title_text = slide .shapes .title .text if slide .shapes .title else ""
128
+ subtitle_text = _extract_subtitle (slide .shapes )
129
+ if title_text or subtitle_text :
130
+ desc = f"Slide description: { title_text } { subtitle_text } " .strip ()
131
+ elements .append (
132
+ TextElement (
133
+ document_meta = document .metadata ,
134
+ location = slide_location ,
135
+ content = desc ,
136
+ )
137
+ )
138
+
139
+ # full slide preview (optional)
140
+ preview_bytes = _render_slide_preview (document .local_path , slide_idx )
141
+ if preview_bytes is not None :
142
+ elements .append (
143
+ ImageElement (
144
+ document_meta = document .metadata ,
145
+ location = slide_location ,
146
+ image_bytes = preview_bytes ,
147
+ description = f"Slide { slide_idx } preview" ,
148
+ )
149
+ )
150
+
84
151
return elements
152
+
153
+
154
+ def _extract_subtitle (shapes : Iterable [Any ]) -> str :
155
+ """Return subtitle placeholder text if present."""
156
+ for shape in shapes :
157
+ if (
158
+ shape .is_placeholder # type: ignore[attr-defined]
159
+ and shape .placeholder_format .type # type: ignore[attr-defined]
160
+ == PP_PLACEHOLDER .SUBTITLE
161
+ and shape .has_text_frame
162
+ ):
163
+ return shape .text
164
+ return ""
165
+
166
+
167
+ def _render_slide_preview (pptx_path : Path , slide_idx : int ) -> bytes | None :
168
+ """Return a PNG rendering of *slide_idx* (1-based) or *None* if unavailable."""
169
+ with slides .Presentation (str (pptx_path )) as pres :
170
+ if slide_idx - 1 >= pres .slides .length :
171
+ return None
172
+ slide = pres .slides [slide_idx - 1 ]
173
+ image = slide .get_thumbnail (2.0 , 2.0 )
174
+ with NamedTemporaryFile (suffix = ".png" , delete = False ) as tmp :
175
+ image .save (tmp .name , drawing .imaging .ImageFormat .png )
176
+ tmp .flush ()
177
+ png_data = Path (tmp .name ).read_bytes ()
178
+ Path (tmp .name ).unlink (missing_ok = True )
179
+ return png_data
0 commit comments