Skip to content

Commit 5538040

Browse files
committed
feat(pptx): integrate python-pptx library for enhanced PPTX parsing
- Added python-pptx as a dependency for improved PPTX file handling.
- Updated the parser import path to reflect the new module structure.
- Moved pptx.py into the new pptx package (pptx/pptx.py) and extracted its dataclasses into a separate module to streamline the codebase.
1 parent e6a4fdf commit 5538040

File tree

5 files changed

+100
-99
lines changed

5 files changed

+100
-99
lines changed

packages/ragbits-document-search/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ dependencies = [
3535
"rerankers>=0.6.1,<1.0.0",
3636
"filetype>=1.2.0,<2.0.0",
3737
"ragbits-core==1.1.0",
38+
"python-pptx>=1.0.0,<2.0.0",
3839
]
3940

4041
[project.urls]

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
2-
from ragbits.document_search.ingestion.parsers.pptx_parser import PptxDocumentParser
2+
from ragbits.document_search.ingestion.parsers.pptx import PptxDocumentParser
33
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
44

55
__all__ = [
packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/dataclasses.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from dataclasses import dataclass
2+
from typing import Any
3+
4+
5+
@dataclass
6+
class PptxTextContent:
7+
"""
8+
Represents extracted text content with formatting information.
9+
"""
10+
11+
text: str
12+
font_name: str | None = None
13+
font_size: float | None = None
14+
is_bold: bool = False
15+
is_italic: bool = False
16+
is_underlined: bool = False
17+
color: str | None = None
18+
hyperlink_url: str | None = None
19+
hierarchy_level: int = 0
20+
21+
22+
@dataclass
23+
class PptxShapeInfo:
24+
"""
25+
Represents extracted shape information.
26+
"""
27+
28+
shape_type: str
29+
name: str
30+
left: float
31+
top: float
32+
width: float
33+
height: float
34+
rotation: float
35+
fill_color: str | None = None
36+
line_color: str | None = None
37+
text_content: str | None = None
38+
is_grouped: bool = False
39+
40+
41+
@dataclass
42+
class PptxImageInfo:
43+
"""
44+
Represents extracted image information.
45+
"""
46+
47+
image_data: bytes
48+
format: str
49+
width: int
50+
height: int
51+
left: float
52+
top: float
53+
description: str | None = None
54+
ocr_text: str | None = None
55+
56+
57+
@dataclass
class PptxSlideInfo:
    """
    Represents comprehensive slide information.

    The slide number, slide image bytes and layout type are required; the
    textual parts and the shape/image collections are optional and default
    to ``None``.
    """

    # Number of the slide (required).
    # NOTE(review): whether numbering is 0- or 1-based is not established here - confirm.
    slide_number: int
    # Image bytes for the slide (required).
    # NOTE(review): presumably a rendering of the whole slide - confirm.
    slide_image: bytes
    # Layout type as a string (required).
    layout_type: str
    # Slide title; None when not supplied.
    title: str | None = None
    # Slide text content; None when not supplied.
    content: str | None = None
    # Speaker notes; None when not supplied.
    speaker_notes: str | None = None
    # Shapes on the slide. None (not an empty list) is the default, so callers
    # must handle None before iterating.
    shapes: list[PptxShapeInfo] | None = None
    # Images on the slide. None (not an empty list) is the default.
    images: list[PptxImageInfo] | None = None
71+
72+
73+
@dataclass
74+
class PptxMetadata:
75+
"""
76+
Represents comprehensive document metadata.
77+
"""
78+
79+
title: str | None = None
80+
author: str | None = None
81+
subject: str | None = None
82+
keywords: str | None = None
83+
category: str | None = None
84+
comments: str | None = None
85+
created: str | None = None
86+
modified: str | None = None
87+
last_modified_by: str | None = None
88+
revision: int | None = None
89+
total_slides: int = 0
90+
custom_properties: dict[str, Any] | None = None

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx.py renamed to packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/pptx.py

Lines changed: 6 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,6 @@
1-
"""
2-
Comprehensive PPTX parser using python-pptx library.
3-
4-
This module provides a complete solution for parsing PowerPoint files, extracting
5-
text, shapes, images, slides, and metadata with memory-efficient processing.
6-
"""
7-
81
import io
92
import logging
10-
from dataclasses import dataclass
113
from pathlib import Path
12-
from typing import Any
134

145
import cv2
156
import numpy as np
@@ -25,101 +16,18 @@
2516
from ragbits.document_search.documents.document import Document, DocumentType
2617
from ragbits.document_search.documents.element import Element, ElementLocation, ImageElement, TextElement
2718
from ragbits.document_search.ingestion.parsers.base import DocumentParser
19+
from ragbits.document_search.ingestion.parsers.pptx.dataclasses import (
20+
PptxImageInfo,
21+
PptxMetadata,
22+
PptxShapeInfo,
23+
PptxSlideInfo,
24+
)
2825

2926
logger = logging.getLogger(__name__)
3027

31-
# Constants
3228
MAX_TEXT_PREVIEW_LENGTH = 200
3329

3430

35-
@dataclass
36-
class PptxTextContent:
37-
"""
38-
Represents extracted text content with formatting information.
39-
"""
40-
41-
text: str
42-
font_name: str | None = None
43-
font_size: float | None = None
44-
is_bold: bool = False
45-
is_italic: bool = False
46-
is_underlined: bool = False
47-
color: str | None = None
48-
hyperlink_url: str | None = None
49-
hierarchy_level: int = 0
50-
51-
52-
@dataclass
53-
class PptxShapeInfo:
54-
"""
55-
Represents extracted shape information.
56-
"""
57-
58-
shape_type: str
59-
name: str
60-
left: float
61-
top: float
62-
width: float
63-
height: float
64-
rotation: float
65-
fill_color: str | None = None
66-
line_color: str | None = None
67-
text_content: str | None = None
68-
is_grouped: bool = False
69-
70-
71-
@dataclass
72-
class PptxImageInfo:
73-
"""
74-
Represents extracted image information.
75-
"""
76-
77-
image_data: bytes
78-
format: str
79-
width: int
80-
height: int
81-
left: float
82-
top: float
83-
description: str | None = None
84-
ocr_text: str | None = None
85-
86-
87-
@dataclass
88-
class PptxSlideInfo:
89-
"""
90-
Represents comprehensive slide information.
91-
"""
92-
93-
slide_number: int
94-
slide_image: bytes
95-
layout_type: str
96-
title: str | None = None
97-
content: str | None = None
98-
speaker_notes: str | None = None
99-
shapes: list[PptxShapeInfo] | None = None
100-
images: list[PptxImageInfo] | None = None
101-
102-
103-
@dataclass
104-
class PptxMetadata:
105-
"""
106-
Represents comprehensive document metadata.
107-
"""
108-
109-
title: str | None = None
110-
author: str | None = None
111-
subject: str | None = None
112-
keywords: str | None = None
113-
category: str | None = None
114-
comments: str | None = None
115-
created: str | None = None
116-
modified: str | None = None
117-
last_modified_by: str | None = None
118-
revision: int | None = None
119-
total_slides: int = 0
120-
custom_properties: dict[str, Any] | None = None
121-
122-
12331
class PptxDocumentParser(DocumentParser):
12432
"""
12533
Comprehensive PPTX parser that extracts text, shapes, images, slides, and metadata.

uv.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)