Skip to content

Commit d3e21c0

Browse files
committed
feat: add python-pptx dependency and update parser imports
- Added python-pptx to dependencies in uv.lock and updated its version constraint in pyproject.toml.
- Refactored import paths for PptxDocumentParser in the ingestion parsers to improve module structure.
1 parent db36d39 commit d3e21c0

File tree

7 files changed

+1057
-3
lines changed

7 files changed

+1057
-3
lines changed

packages/ragbits-document-search/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ dependencies = [
3535
"rerankers>=0.6.1,<1.0.0",
3636
"filetype>=1.2.0,<2.0.0",
3737
"ragbits-core==1.1.0",
38-
"python-pptx>=0.6.21,<1.0.0",
38+
"python-pptx>=1.0.0,<2.0.0",
3939
]
4040

4141
[project.urls]

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from ragbits.document_search.ingestion.parsers.base import DocumentParser, ImageDocumentParser, TextDocumentParser
2-
from ragbits.document_search.ingestion.parsers.pptx_parser import PptxDocumentParser
2+
from ragbits.document_search.ingestion.parsers.pptx import PptxDocumentParser
33
from ragbits.document_search.ingestion.parsers.router import DocumentParserRouter
44

55
__all__ = [

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from ragbits.document_search.ingestion.parsers.pptx.parser import PptxDocumentParser
1+
from .parser import PptxDocumentParser
22

33
__all__ = [
44
"PptxDocumentParser",
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from .dataclasses import (
2+
ExtractedHyperlink,
3+
ExtractedImage,
4+
ExtractedMetadata,
5+
ExtractedShape,
6+
ExtractedSlideImage,
7+
ExtractedSpeakerNotes,
8+
ExtractedText,
9+
)
10+
from .extractors import (
11+
BaseExtractor,
12+
HyperlinkExtractor,
13+
ImageExtractor,
14+
MetadataExtractor,
15+
ShapeExtractor,
16+
SpeakerNotesExtractor,
17+
TextExtractor,
18+
)
19+
# Names re-exported by this package: the ``Extracted*`` result dataclasses
# and the ``*Extractor`` classes imported from the sibling modules.
# Kept in alphabetical order.
__all__ = [
    "BaseExtractor",
    "ExtractedHyperlink",
    "ExtractedImage",
    "ExtractedMetadata",
    "ExtractedShape",
    "ExtractedSlideImage",
    "ExtractedSpeakerNotes",
    "ExtractedText",
    "HyperlinkExtractor",
    "ImageExtractor",
    "MetadataExtractor",
    "ShapeExtractor",
    "SpeakerNotesExtractor",
    "TextExtractor",
]
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass, field
4+
from datetime import datetime
5+
from typing import Any
6+
7+
@dataclass
class ExtractedHyperlink:
    """Represents a hyperlink extracted from a PPTX file.

    ``url``, ``display_text`` and ``slide_index`` are required. Every other
    field is optional and defaults to ``None`` (``is_internal`` defaults to
    ``False``). Field order is part of the positional constructor signature.
    """

    # Required fields.
    url: str
    display_text: str
    # NOTE(review): whether this index is 0- or 1-based is not visible here —
    # confirm against the extractor that fills it in.
    slide_index: int

    # Optional identification of the shape carrying the link.
    shape_id: int | None = None
    shape_name: str | None = None

    # Presumably describes links that jump to another slide of the same
    # presentation — TODO confirm with the hyperlink extractor.
    is_internal: bool = False
    target_slide_index: int | None = None
19+
20+
@dataclass
class ExtractedText:
    """Represents text content extracted from a PPTX file.

    Only ``content`` and ``slide_index`` are required. Formatting and
    geometry fields are optional and default to ``None``, ``False`` or ``0``.
    Field order is part of the positional constructor signature.
    """

    # Required fields.
    content: str
    slide_index: int

    # Optional identification of the source shape.
    shape_id: int | None = None
    shape_name: str | None = None

    hierarchy_level: int = 0  # 0=title, 1=subtitle, 2=body, etc.

    # Optional character formatting.
    font_size: float | None = None
    is_bold: bool = False
    is_italic: bool = False
    is_underline: bool = False
    font_name: str | None = None
    color_rgb: str | None = None

    # Optional geometry.
    # NOTE(review): units are not visible here (presumably EMU as reported by
    # python-pptx) — confirm.
    left: int | None = None
    top: int | None = None
    width: int | None = None
    height: int | None = None
40+
41+
@dataclass
class ExtractedImage:
    """Represents an image extracted from a PPTX file.

    ``image_bytes`` and ``slide_index`` are required; every other field is
    optional and defaults to ``None``. Field order is part of the positional
    constructor signature.
    """

    # Required fields.
    image_bytes: bytes
    slide_index: int

    # Optional identification of the source shape.
    shape_id: int | None = None
    shape_name: str | None = None

    # Optional image format label. The field name shadows the ``format``
    # builtin inside the class body only; it is kept because it is part of
    # the public constructor interface.
    format: str | None = None

    # Optional size and position.
    # NOTE(review): units are not visible here — confirm with the extractor.
    width: int | None = None
    height: int | None = None
    left: int | None = None
    top: int | None = None
55+
56+
@dataclass
class ExtractedShape:
    """Represents a shape extracted from a PPTX file.

    ``shape_type`` and ``slide_index`` are required; every other field is
    optional and defaults to ``None``. Field order is part of the positional
    constructor signature.
    """

    # Required fields.
    shape_type: str
    slide_index: int

    # Optional identification of the shape.
    shape_id: int | None = None
    shape_name: str | None = None

    # Optional geometry.
    # NOTE(review): units of position/size and of ``rotation`` are not
    # visible here — confirm with the shape extractor.
    left: int | None = None
    top: int | None = None
    width: int | None = None
    height: int | None = None
    rotation: float | None = None

    # Optional appearance and text.
    fill_color: str | None = None
    line_color: str | None = None
    text_content: str | None = None
73+
74+
@dataclass
class ExtractedSpeakerNotes:
    """Represents speaker notes extracted from a PPTX file.

    ``content`` and ``slide_index`` are required. ``formatting`` defaults to
    a fresh empty dict per instance.
    """

    content: str
    slide_index: int
    # default_factory gives every instance its own dict, so mutating one
    # instance's formatting never leaks into another.
    # NOTE(review): the key/value schema of this dict is not visible here.
    formatting: dict[str, Any] = field(default_factory=dict)
82+
83+
@dataclass
class ExtractedSlideImage:
    """Represents a slide rendered as an image.

    All fields except ``format`` are required; ``format`` defaults to
    ``"PNG"``. Field order is part of the positional constructor signature.
    """

    image_bytes: bytes
    slide_index: int
    # NOTE(review): presumably the rendered size in pixels — confirm with the
    # code that produces the rendering.
    width: int
    height: int
    # The field name shadows the ``format`` builtin inside the class body
    # only; it is kept because it is part of the public constructor interface.
    format: str = "PNG"
93+
94+
@dataclass
class ExtractedMetadata:
    """Represents document metadata extracted from a PPTX file.

    Every field is optional and defaults to ``None``, so an instance can be
    created with no arguments. Field order is part of the positional
    constructor signature.
    """

    # Descriptive text properties.
    title: str | None = None
    author: str | None = None
    subject: str | None = None
    keywords: str | None = None
    comments: str | None = None
    category: str | None = None

    # Authoring history.
    # NOTE(review): whether these datetimes are timezone-aware is not visible
    # here — confirm with the metadata extractor.
    created: datetime | None = None
    modified: datetime | None = None
    last_modified_by: str | None = None
    last_printed: datetime | None = None
    revision: int | None = None

    # Remaining document properties.
    version: str | None = None
    language: str | None = None
    content_status: str | None = None
    identifier: str | None = None

0 commit comments

Comments
 (0)