Skip to content

Commit 27a641f

Browse files
committed
feat(pptx): add temporary testing script and enhance PPTX parser
- Introduced a new script for creating dummy PPTX files to facilitate testing of the PPTX parser.
- Updated the PptxDocumentParser to use DocumentMeta for improved document handling.
- Refactored the extractor classes for clarity and maintainability, renaming them to follow a consistent naming convention.
- Improved the extraction methods for text, hyperlinks, images, shapes, metadata, and speaker notes.
1 parent 6727cce commit 27a641f

File tree

4 files changed

+210
-80
lines changed

4 files changed

+210
-80
lines changed
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# This is a temporary script for development purposes and PR testing.
2+
# It will be removed before merging.
3+
4+
from __future__ import annotations
5+
6+
import asyncio
7+
import os
8+
from pathlib import Path
9+
10+
from pptx import Presentation
11+
from pptx.util import Inches
12+
13+
from ragbits.core.sources.local import LocalFileSource
14+
from ragbits.document_search.documents.document import Document, DocumentMeta, DocumentType
15+
from ragbits.document_search.ingestion.parsers.pptx.parser import PptxDocumentParser
16+
17+
18+
async def create_dummy_pptx(file_path: str) -> None:
    """Create a small four-slide PPTX file for exercising the PPTX parser.

    The deck contains a title slide, a bullet slide with a hyperlinked run,
    a slide with a picture (only when the test image is available) and a
    slide with speaker notes.

    The function is declared ``async`` only so callers can ``await`` it; it
    performs no asynchronous work itself.

    Args:
        file_path: Destination path of the generated presentation.
    """
    prs = Presentation()

    # Slide 1: Title Slide
    slide1 = prs.slides.add_slide(prs.slide_layouts[0])
    title = slide1.shapes.title
    subtitle = slide1.placeholders[1]
    if title and title.has_text_frame:
        title.text_frame.text = "Test Presentation"
    if subtitle and subtitle.has_text_frame:
        subtitle.text_frame.text = "A presentation for testing the PPTX parser."

    # Slide 2: Text, Shape, and Hyperlink
    slide2 = prs.slides.add_slide(prs.slide_layouts[1])
    shapes = slide2.shapes
    title_shape = shapes.title
    if title_shape and title_shape.has_text_frame:
        title_shape.text_frame.text = "This is a slide with text, a shape, and a hyperlink."

    body_shape = shapes.placeholders[1]
    if body_shape and body_shape.has_text_frame:
        tf = body_shape.text_frame
        tf.text = "This is a bullet point."

        p = tf.add_paragraph()
        p.text = "This is a line with a "
        link_run = p.add_run()
        link_run.text = "hyperlink"
        # ``hyperlink`` always returns a proxy object; assigning ``address``
        # is what actually creates the link, so no guard is needed.
        link_run.hyperlink.address = "https://www.google.com"

    # Slide 3: Image
    slide3 = prs.slides.add_slide(prs.slide_layouts[5])
    # Relative to the current working directory (looks like a repository-root path).
    img_path = "packages/ragbits-core/tests/assets/img/test.png"
    if Path(img_path).is_file():
        left = top = Inches(1)
        slide3.shapes.add_picture(img_path, left, top)
    else:
        # Make the skipped picture visible instead of silently producing a
        # deck that never exercises image extraction.
        print(f"Warning: test image not found at {img_path}; slide 3 will have no picture.")

    # Slide 4: With speaker notes
    slide4 = prs.slides.add_slide(prs.slide_layouts[1])
    # A freshly added slide has no notes slide yet, so guarding on
    # ``has_notes_slide`` would skip the notes entirely. Accessing
    # ``notes_slide`` creates the notes slide on first use.
    notes_text_frame = slide4.notes_slide.notes_text_frame
    if notes_text_frame is not None:
        notes_text_frame.text = "These are speaker notes for slide 4."

    prs.save(file_path)
70+
71+
72+
async def main():
    """Generate a sample deck, run it through the PPTX parser and print the result.

    Any exception raised while building the document or parsing it is caught
    and reported on stdout as ``Error: ...``.
    """
    deck_name = "test_pptx.pptx"
    await create_dummy_pptx(deck_name)

    try:
        deck_path = Path(deck_name)
        meta = DocumentMeta(
            document_type=DocumentType.PPTX,
            source=LocalFileSource(path=deck_path),
        )
        doc = Document.from_document_meta(meta, deck_path)

        pptx_parser = PptxDocumentParser()
        parsed = await pptx_parser.parse(doc)

        # Dump every extracted element, one separator line after each.
        separator = "-" * 20
        print(f"--- Extracted {len(parsed)} elements ---")
        for item in parsed:
            print(f"Type: {item.element_type}")
            print(f"Content: {item.text_representation}")
            print(f"Location: {item.location}")
            print(separator)

    except Exception as e:
        print(f"Error: {e}")
96+
97+
98+
# Script entry point: when the file is executed directly, run the async
# main() driver to completion with asyncio.run().
if __name__ == "__main__":
99+
asyncio.run(main())

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors/__init__.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
from .extractors import (
22
DEFAULT_EXTRACTORS,
33
BasePptxExtractor,
4-
HyperlinkExtractor,
5-
ImageExtractor,
6-
MetadataExtractor,
7-
ShapeExtractor,
8-
SpeakerNotesExtractor,
9-
TextExtractor,
4+
PptxHyperlinkExtractor,
5+
PptxImageExtractor,
6+
PptxMetadataExtractor,
7+
PptxShapeExtractor,
8+
PptxSpeakerNotesExtractor,
9+
PptxTextExtractor,
1010
)
1111

1212
__all__ = [

0 commit comments

Comments
 (0)