Skip to content

Commit 2be019b

Browse files
committed
feat: add PPTX document parser and update dependencies
- Introduced PptxDocumentParser for extracting content from PPTX files.
- Updated pyproject.toml to include python-pptx as a dependency.
- Modified the router to use PptxDocumentParser for PPTX document types.
- Cleaned up formatting in pyproject.toml and __init__.py for consistency.
1 parent 01279ad commit 2be019b

File tree

4 files changed

+105
-10
lines changed

4 files changed

+105
-10
lines changed

packages/ragbits-document-search/pyproject.toml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,15 @@ description = "Document Search module for Ragbits"
55
readme = "README.md"
66
requires-python = ">=3.10"
77
license = "MIT"
8-
authors = [
9-
{ name = "deepsense.ai", email = "[email protected]"}
10-
]
8+
authors = [{ name = "deepsense.ai", email = "[email protected]" }]
119
keywords = [
1210
"Retrieval Augmented Generation",
1311
"RAG",
1412
"Large Language Models",
1513
"LLMs",
1614
"Generative AI",
1715
"GenAI",
18-
"Document Search"
16+
"Document Search",
1917
]
2018
classifiers = [
2119
"Development Status :: 4 - Beta",
@@ -31,7 +29,14 @@ classifiers = [
3129
"Topic :: Scientific/Engineering :: Artificial Intelligence",
3230
"Topic :: Software Development :: Libraries :: Python Modules",
3331
]
34-
dependencies = ["docling>=2.15.1,<3.0.0", "opencv-python>=4.11.0.86,<5.0.0.0", "rerankers>=0.6.1,<1.0.0", "filetype>=1.2.0,<2.0.0", "ragbits-core==1.0.0"]
32+
dependencies = [
33+
"docling>=2.15.1,<3.0.0",
34+
"opencv-python>=4.11.0.86,<5.0.0.0",
35+
"rerankers>=0.6.1,<1.0.0",
36+
"filetype>=1.2.0,<2.0.0",
37+
"ragbits-core==1.0.0",
38+
"python-pptx>=0.6.23,<1.0.0",
39+
]
3540

3641
[project.urls]
3742
"Homepage" = "https://github.com/deepsense-ai/ragbits"
@@ -44,9 +49,7 @@ unstructured = [
4449
"unstructured>=0.16.9,<1.0.0",
4550
"unstructured-client>=0.26.0,<1.0.0",
4651
]
47-
ray = [
48-
"ray[data]>=2.43.0,<3.0.0",
49-
]
52+
ray = ["ray[data]>=2.43.0,<3.0.0"]
5053

5154
[tool.uv]
5255
dev-dependencies = [
Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
11
from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
2+
from ragbits.document_search.ingestion.parsers.pptx_parser import PptxDocumentParser
23
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
34

4-
__all__ = ["DocumentParser", "DocumentParserRouter", "ImageDocumentParser", "TextDocumentParser"]
5+
__all__ = [
6+
"DocumentParser",
7+
"DocumentParserRouter",
8+
"ImageDocumentParser",
9+
"PptxDocumentParser",
10+
"TextDocumentParser",
11+
]
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from __future__ import annotations
2+
3+
from pptx import Presentation
4+
from pptx.enum.shapes import MSO_SHAPE_TYPE
5+
6+
from ragbits.document_search.documents.document import Document, DocumentType
7+
from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
8+
from ragbits.document_search.ingestion.parsers.base import DocumentParser
9+
10+
11+
class PptxDocumentParser(DocumentParser):
    """Parser that extracts content from PPTX files using *python-pptx*.

    For every slide the parser emits, in shape order:

    * a :class:`~ragbits.document_search.documents.element.TextElement` for each shape whose
      text frame contains non-blank text,
    * a :class:`~ragbits.document_search.documents.element.TextElement` for each non-blank
      table cell,
    * an :class:`~ragbits.document_search.documents.element.ImageElement` for each picture
      that carries embedded image bytes,

    followed by one ``TextElement`` for the slide notes when they are non-blank. Shapes nested
    inside group shapes are visited as well. Every element is located by the 1-based index of
    the slide it came from.
    """

    supported_document_types = {DocumentType.PPTX}

    async def parse(self, document: Document) -> list[Element]:
        """Parse the given PPTX document.

        Args:
            document: The document to parse.

        Returns:
            A list of extracted elements, ordered by slide and, within a slide, by shape
            order with the slide notes last.
        """
        # Rejects anything that is not listed in ``supported_document_types``.
        self.validate_document_type(document.metadata.document_type)
        presentation = Presentation(str(document.local_path))
        elements: list[Element] = []

        for slide_idx, slide in enumerate(presentation.slides, start=1):
            slide_location = ElementLocation(page_number=slide_idx)

            # Explicit depth-first walk so that members of group shapes are not lost.
            # The list is reversed because ``pop()`` takes from the end; this keeps the
            # original shape order in the output.
            pending = list(slide.shapes)[::-1]
            while pending:
                shape = pending.pop()

                # python-pptx raises NotImplementedError for shape kinds it does not model;
                # such a shape may still expose text, so do not abort the whole parse.
                try:
                    shape_type = shape.shape_type
                except NotImplementedError:
                    shape_type = None

                if shape_type == MSO_SHAPE_TYPE.GROUP:
                    # A group has no content of its own; descend into its members.
                    pending.extend(list(shape.shapes)[::-1])
                    continue

                if shape.has_text_frame:
                    elements.extend(self._text_elements(document, slide_location, [shape.text]))

                if shape_type == MSO_SHAPE_TYPE.TABLE:
                    cell_texts = [cell.text for row in shape.table.rows for cell in row.cells]
                    elements.extend(self._text_elements(document, slide_location, cell_texts))

                if shape_type == MSO_SHAPE_TYPE.PICTURE:
                    try:
                        image_bytes = shape.image.blob
                    except ValueError:
                        # Linked (non-embedded) pictures have no bytes inside the package.
                        continue
                    # NOTE(review): python-pptx does not appear to expose a public
                    # ``alt_text`` attribute on pictures, so this likely always yields
                    # None - confirm and switch to the real source of alt text if needed.
                    description = getattr(shape, "alt_text", None) or None
                    elements.append(
                        ImageElement(
                            document_meta=document.metadata,
                            location=slide_location,
                            image_bytes=image_bytes,
                            description=description,
                        )
                    )

            if slide.has_notes_slide and slide.notes_slide.notes_text_frame is not None:
                notes_text = slide.notes_slide.notes_text_frame.text
                elements.extend(self._text_elements(document, slide_location, [notes_text]))

        return elements

    @staticmethod
    def _text_elements(document: Document, location: ElementLocation, texts: list[str]) -> list[TextElement]:
        """Build a stripped ``TextElement`` for every non-blank string in ``texts``.

        Args:
            document: The document the texts were extracted from.
            location: The location shared by all produced elements.
            texts: Raw strings; empty and whitespace-only entries are skipped.

        Returns:
            One element per non-blank input string, in input order.
        """
        return [
            TextElement(
                document_meta=document.metadata,
                location=location,
                content=stripped,
            )
            for stripped in ((raw or "").strip() for raw in texts)
            if stripped
        ]

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from ragbits.document_search.ingestion.parsers.base import DocumentParser
99
from ragbits.document_search.ingestion.parsers.docling import DoclingDocumentParser
1010
from ragbits.document_search.ingestion.parsers.exceptions import ParserNotFoundError
11+
from ragbits.document_search.ingestion.parsers.pptx_parser import PptxDocumentParser
1112

1213
_default_parser = DoclingDocumentParser()
1314

@@ -16,7 +17,7 @@
1617
DocumentType.MD: _default_parser,
1718
DocumentType.PDF: _default_parser,
1819
DocumentType.DOCX: _default_parser,
19-
DocumentType.PPTX: _default_parser,
20+
DocumentType.PPTX: PptxDocumentParser(),
2021
DocumentType.XLSX: _default_parser,
2122
DocumentType.HTML: _default_parser,
2223
DocumentType.JPG: _default_parser,

0 commit comments

Comments
 (0)