Skip to content

Commit 01275d6

Browse files
committed
feat(04-02): convert ImageLoader, PyMuPDF4LLMLoader, and PPTXLoader to non-blocking I/O
- ImageLoader: Wrap PIL.Image.open and cairosvg.svg2png with asyncio.to_thread
- Created _load_image sync helper for SVG conversion and image loading
- Updated save_content call to use await
- PyMuPDF4LLMLoader: Wrap pymupdf4llm.to_markdown with asyncio.to_thread
- Updated both PyMuPDFLoader and PyMuPDF4LLMLoader save_content calls to use await
- PPTXLoader: Wrap PPTXConverter.convert with asyncio.to_thread
- Offload pptx.Presentation and image blob reading to thread pool
- Updated save_content call to use await
1 parent d96e7b2 commit 01275d6

File tree

3 files changed

+20
-12
lines changed

3 files changed

+20
-12
lines changed

openrag/components/indexer/loaders/image.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
from io import BytesIO
23
from pathlib import Path
34

@@ -19,16 +20,19 @@ class ImageLoader(BaseLoader):
1920
def __init__(self, **kwargs):
2021
super().__init__(**kwargs)
2122

23+
def _load_image(self, path: Path):
24+
"""Load image file, converting SVG to PNG if needed."""
25+
if path.suffix.lower() == ".svg":
26+
png_data = cairosvg.svg2png(url=str(path))
27+
return Image.open(BytesIO(png_data))
28+
else:
29+
return Image.open(path)
30+
2231
async def aload_document(self, file_path, metadata=None, save_markdown=False):
2332
path = Path(file_path)
2433

2534
try:
26-
# Handle SVG files by converting to PNG first
27-
if path.suffix.lower() == ".svg":
28-
png_data = cairosvg.svg2png(url=str(path))
29-
img = Image.open(BytesIO(png_data))
30-
else:
31-
img = Image.open(path)
35+
img = await asyncio.to_thread(self._load_image, path)
3236
except OSError as e:
3337
# File not found, permission denied, etc.
3438
log.error("Cannot read image file", file_path=str(path), error=str(e))
@@ -50,5 +54,5 @@ async def aload_document(self, file_path, metadata=None, save_markdown=False):
5054
description = await self.get_image_description(image_data=img)
5155
doc = Document(page_content=description, metadata=metadata)
5256
if save_markdown:
53-
self.save_content(description, str(path))
57+
await self.save_content(description, str(path))
5458
return doc

openrag/components/indexer/loaders/pdf_loaders/pymupdf.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
from pathlib import Path
23

34
import pymupdf4llm
@@ -23,7 +24,7 @@ async def aload_document(self, file_path, metadata: dict = None, save_markdown=F
2324

2425
doc = Document(page_content=s, metadata=metadata)
2526
if save_markdown:
26-
self.save_content(s, str(file_path))
27+
await self.save_content(s, str(file_path))
2728
return doc
2829

2930

@@ -32,13 +33,15 @@ def __init__(self, **kwargs) -> None:
3233
super().__init__(**kwargs)
3334

3435
async def aload_document(self, file_path, metadata: dict = None, save_markdown=False):
35-
pages = pymupdf4llm.to_markdown(file_path, write_images=False, page_chunks=True)
36+
pages = await asyncio.to_thread(
37+
pymupdf4llm.to_markdown, file_path, write_images=False, page_chunks=True
38+
)
3639

3740
s = ""
3841
for page_num, segment in enumerate(pages, start=1):
3942
s += segment.get("text").strip() + f"\n[PAGE_{page_num}]\n"
4043

4144
doc = Document(page_content=s, metadata=metadata)
4245
if save_markdown:
43-
self.save_content(s, str(file_path))
46+
await self.save_content(s, str(file_path))
4447
return doc

openrag/components/indexer/loaders/pptx_loader.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
import html
23
import re
34
from io import BytesIO
@@ -149,7 +150,7 @@ def __init__(self, **kwargs) -> None:
149150
self.converter = PPTXConverter(image_placeholder=self.image_placeholder, page_separator=self.page_sep)
150151

151152
async def aload_document(self, file_path, metadata=None, save_markdown=False):
152-
md_content, imgs = self.converter.convert(local_path=file_path)
153+
md_content, imgs = await asyncio.to_thread(self.converter.convert, local_path=file_path)
153154

154155
if self.image_captioning:
155156
images_captions = await self.caption_images(imgs, desc="Generating captions")
@@ -168,5 +169,5 @@ async def aload_document(self, file_path, metadata=None, save_markdown=False):
168169

169170
doc = Document(page_content=md_content, metadata=metadata)
170171
if save_markdown:
171-
self.save_content(md_content, str(file_path))
172+
await self.save_content(md_content, str(file_path))
172173
return doc

0 commit comments

Comments
 (0)