Skip to content

Commit ea41ed3

Browse files
committed
refactor(pptx): convert extractor methods to static methods
- Updated several methods in the BasePptxExtractor and its subclasses to be static, improving clarity and reducing unnecessary instance dependencies.
- Methods modified include _get_slides, _get_shape_info, _extract_text_content, _extract_hyperlink_content, _extract_image_content, _extract_shape_content, and get_extractor_name across various extractor classes.
1 parent 426a6ba commit ea41ed3

File tree

1 file changed

+25
-13
lines changed
  • packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors

1 file changed

+25
-13
lines changed

packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/pptx/extractors/extractors.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,14 @@
2525
class BasePptxExtractor(ABC):
2626
"""Base class for all PPTX content extractors."""
2727

28-
def _get_slides(self, presentation: Presentation, slide: Slide | None = None) -> list[tuple[int, Slide]]:
28+
@staticmethod
29+
def _get_slides(presentation: Presentation, slide: Slide | None = None) -> list[tuple[int, Slide]]:
2930
"""Get slides with their indices."""
3031
slides = [slide] if slide else list(presentation.slides)
3132
return list(enumerate(slides, start=1))
3233

33-
def _get_shape_info(self, shape: BaseShape) -> str:
34+
@staticmethod
35+
def _get_shape_info(shape: BaseShape) -> str:
3436
"""Get descriptive information about a shape for logging purposes."""
3537
try:
3638
shape_type = getattr(shape, "shape_type", "unknown")
@@ -40,8 +42,8 @@ def _get_shape_info(self, shape: BaseShape) -> str:
4042
except Exception:
4143
return "unknown_shape"
4244

45+
@staticmethod
4346
def _create_text_element(
44-
self,
4547
element_type: str,
4648
document_meta: DocumentMeta,
4749
content: str,
@@ -177,7 +179,8 @@ def get_extractor_name(self) -> str:
177179
class PptxTextExtractor(BasePptxExtractor):
178180
"""Extracts text content from text frames."""
179181

180-
def _extract_text_content(self, shape: BaseShape) -> str | None:
182+
@staticmethod
183+
def _extract_text_content(shape: BaseShape) -> str | None:
181184
"""Extract text content from a shape."""
182185
if not isinstance(shape, Shape):
183186
return None
@@ -201,15 +204,17 @@ def extract(
201204
logger.error("Text extraction failed: %s", str(e), exc_info=True)
202205
raise PptxExtractorError(self.get_extractor_name(), e) from e
203206

204-
def get_extractor_name(self) -> str:
207+
@staticmethod
208+
def get_extractor_name() -> str:
205209
"""Get the name of this extractor."""
206210
return "pptx_text_extractor"
207211

208212

209213
class PptxHyperlinkExtractor(BasePptxExtractor):
210214
"""Extracts hyperlink addresses from shapes."""
211215

212-
def _extract_hyperlink_content(self, shape: BaseShape) -> str | None:
216+
@staticmethod
217+
def _extract_hyperlink_content(shape: BaseShape) -> str | None:
213218
"""Extract hyperlink content from a shape."""
214219
if not hasattr(shape, "click_action") or isinstance(shape, GroupShape):
215220
return None
@@ -229,15 +234,17 @@ def extract(
229234
element_type="hyperlink",
230235
)
231236

232-
def get_extractor_name(self) -> str:
237+
@staticmethod
238+
def get_extractor_name() -> str:
233239
"""Get the name of this extractor."""
234240
return "pptx_hyperlink_extractor"
235241

236242

237243
class PptxImageExtractor(BasePptxExtractor):
238244
"""Extracts image information from shapes."""
239245

240-
def _extract_image_content(self, shape: BaseShape) -> str | None:
246+
@staticmethod
247+
def _extract_image_content(shape: BaseShape) -> str | None:
241248
"""Extract image content from a shape."""
242249
if not isinstance(shape, Picture):
243250
return None
@@ -258,15 +265,17 @@ def extract(
258265
element_type="image",
259266
)
260267

261-
def get_extractor_name(self) -> str:
268+
@staticmethod
269+
def get_extractor_name() -> str:
262270
"""Get the name of this extractor."""
263271
return "pptx_image_extractor"
264272

265273

266274
class PptxShapeExtractor(BasePptxExtractor):
267275
"""Extracts shape information and metadata."""
268276

269-
def _extract_shape_content(self, shape: BaseShape) -> str | None:
277+
@staticmethod
278+
def _extract_shape_content(shape: BaseShape) -> str | None:
270279
"""Extract shape metadata from a shape."""
271280
if not hasattr(shape, "shape_type"):
272281
return None
@@ -284,7 +293,8 @@ def extract(
284293
element_type="shape",
285294
)
286295

287-
def get_extractor_name(self) -> str:
296+
@staticmethod
297+
def get_extractor_name() -> str:
288298
"""Get the name of this extractor."""
289299
return "pptx_shape_extractor"
290300

@@ -327,7 +337,8 @@ def extract(
327337
logger.error("Metadata extraction failed: %s", str(e), exc_info=True)
328338
raise PptxExtractorError(self.get_extractor_name(), e) from e
329339

330-
def get_extractor_name(self) -> str:
340+
@staticmethod
341+
def get_extractor_name() -> str:
331342
"""Get the name of this extractor."""
332343
return "pptx_metadata_extractor"
333344

@@ -389,7 +400,8 @@ def extract(
389400
logger.error("Speaker notes extraction failed: %s", str(e), exc_info=True)
390401
raise PptxExtractorError(self.get_extractor_name(), e) from e
391402

392-
def get_extractor_name(self) -> str:
403+
@staticmethod
404+
def get_extractor_name() -> str:
393405
"""Get the name of this extractor."""
394406
return "pptx_speaker_notes_extractor"
395407

0 commit comments

Comments
 (0)