Skip to content

Commit a4a791e

Browse files
Revert "Remove fs"
This reverts commit baa11ee.
1 parent f51cb0d commit a4a791e

File tree

4 files changed

+94
-20
lines changed

4 files changed

+94
-20
lines changed

examples/customize/build_graph/components/loaders/custom_loader.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from pathlib import Path
44
from typing import Dict, Optional, Union
55

6+
from fsspec import AbstractFileSystem
7+
68
from neo4j_graphrag.experimental.components.data_loader import DataLoader
79
from neo4j_graphrag.experimental.components.types import DocumentInfo, LoadedDocument
810

@@ -12,8 +14,10 @@ async def run(
1214
self,
1315
filepath: Union[str, Path],
1416
metadata: Optional[Dict[str, str]] = None,
17+
fs: Optional[Union[AbstractFileSystem, str]] = None,
1518
) -> LoadedDocument:
16-
# Implement logic here to read and transform the input file.
19+
# Implement logic here; use ``fs`` when reading from non-local storage.
20+
_ = fs
1721
return LoadedDocument(
1822
text="<extracted text>",
1923
document_info=DocumentInfo(

src/neo4j_graphrag/experimental/components/data_loader.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,15 @@
1616

1717
from __future__ import annotations
1818

19+
import io
1920
from abc import abstractmethod
2021
from pathlib import Path
21-
from typing import Dict, Optional, Union
22+
from typing import Dict, Optional, Union, cast
2223

24+
import fsspec
2325
import pypdf
26+
from fsspec import AbstractFileSystem
27+
from fsspec.implementations.local import LocalFileSystem
2428

2529
from neo4j_graphrag.exceptions import MarkdownLoadError, PdfLoaderError
2630
from neo4j_graphrag.experimental.components.types import (
@@ -31,6 +35,10 @@
3135
from neo4j_graphrag.experimental.pipeline.component import Component
3236

3337

38+
def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
39+
return isinstance(fs, LocalFileSystem) and not fs.auto_mkdir
40+
41+
3442
class DataLoader(Component):
3543
"""
3644
Interface for loading data of various input types.
@@ -46,18 +54,23 @@ async def run(
4654
self,
4755
filepath: Union[str, Path],
4856
metadata: Optional[Dict[str, str]] = None,
57+
fs: Optional[Union[AbstractFileSystem, str]] = None,
4958
) -> LoadedDocument: ...
5059

5160

5261
class PdfLoader(DataLoader):
5362
"""Loads text from PDF files using pypdf."""
5463

5564
@staticmethod
56-
def load_file(file: str) -> str:
65+
def load_file(
66+
file: str,
67+
fs: AbstractFileSystem,
68+
) -> str:
5769
"""Parse a PDF file and return extracted text."""
5870
try:
59-
with open(file, "rb") as fp:
60-
pdf = pypdf.PdfReader(fp)
71+
with fs.open(file, "rb") as fp:
72+
stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
73+
pdf = pypdf.PdfReader(stream)
6174
num_pages = len(pdf.pages)
6275
text_parts = (
6376
pdf.pages[page].extract_text() for page in range(num_pages)
@@ -70,10 +83,15 @@ async def run(
7083
self,
7184
filepath: Union[str, Path],
7285
metadata: Optional[Dict[str, str]] = None,
86+
fs: Optional[Union[AbstractFileSystem, str]] = None,
7387
) -> LoadedDocument:
7488
if not isinstance(filepath, str):
7589
filepath = str(filepath)
76-
text = self.load_file(filepath)
90+
if isinstance(fs, str):
91+
fs = fsspec.filesystem(fs)
92+
elif fs is None:
93+
fs = LocalFileSystem()
94+
text = self.load_file(filepath, fs)
7795
return LoadedDocument(
7896
text=text,
7997
document_info=DocumentInfo(
@@ -88,22 +106,30 @@ class MarkdownLoader(DataLoader):
88106
"""Loads UTF-8 Markdown (``.md`` / ``.markdown``) files as plain text."""
89107

90108
@staticmethod
91-
def load_file(file: str) -> str:
109+
def load_file(
110+
file: str,
111+
fs: AbstractFileSystem,
112+
) -> str:
92113
try:
93-
with open(file, "rb") as fp:
114+
with fs.open(file, "rb") as fp:
94115
raw = fp.read()
95-
return raw.decode("utf-8")
116+
return cast(str, raw.decode("utf-8"))
96117
except Exception as e:
97118
raise MarkdownLoadError(e)
98119

99120
async def run(
100121
self,
101122
filepath: Union[str, Path],
102123
metadata: Optional[Dict[str, str]] = None,
124+
fs: Optional[Union[AbstractFileSystem, str]] = None,
103125
) -> LoadedDocument:
104126
if not isinstance(filepath, str):
105127
filepath = str(filepath)
106-
text = MarkdownLoader.load_file(filepath)
128+
if isinstance(fs, str):
129+
fs = fsspec.filesystem(fs)
130+
elif fs is None:
131+
fs = LocalFileSystem()
132+
text = MarkdownLoader.load_file(filepath, fs)
107133
return LoadedDocument(
108134
text=text,
109135
document_info=DocumentInfo(

src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
)
2727
import warnings
2828

29+
from fsspec import AbstractFileSystem
2930
from pydantic import ConfigDict, Field, field_validator, model_validator
3031
from typing_extensions import Self
3132

@@ -83,13 +84,16 @@ async def run(
8384
self,
8485
filepath: Union[str, Path],
8586
metadata: Optional[dict[str, str]] = None,
87+
fs: Optional[Union[AbstractFileSystem, str]] = None,
8688
) -> LoadedDocument:
8789
path_str = str(filepath)
8890
suffix = Path(path_str).suffix.lower()
8991
if suffix == ".pdf":
90-
return await PdfLoader().run(filepath=path_str, metadata=metadata)
92+
return await PdfLoader().run(filepath=path_str, metadata=metadata, fs=fs)
9193
if suffix in (".md", ".markdown"):
92-
return await MarkdownLoader().run(filepath=path_str, metadata=metadata)
94+
return await MarkdownLoader().run(
95+
filepath=path_str, metadata=metadata, fs=fs
96+
)
9397
raise UnsupportedDocumentFormatError(
9498
f"Unsupported document format: {suffix!r}. "
9599
f"Supported: .pdf, .md, .markdown"

tests/unit/experimental/components/test_data_loader.py

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
from unittest.mock import patch
2121

2222
import pytest
23+
from fsspec import AbstractFileSystem
24+
from fsspec.implementations.local import LocalFileSystem
2325
from neo4j_graphrag.exceptions import MarkdownLoadError, PdfLoaderError
2426
from neo4j_graphrag.experimental.components.data_loader import (
2527
MarkdownLoader,
@@ -47,25 +49,31 @@ def dummy_md_path() -> str:
4749

4850
def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
4951
expected_content = "Lorem ipsum dolor sit amet."
50-
actual_content = pdf_loader.load_file(dummy_pdf_path)
52+
actual_content = pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem())
5153
assert actual_content == expected_content
5254

5355

5456
def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
55-
with patch("builtins.open", side_effect=Exception("Failed to open")):
57+
with patch(
58+
"fsspec.implementations.local.LocalFileSystem.open",
59+
side_effect=Exception("Failed to open"),
60+
):
5661
with pytest.raises(PdfLoaderError):
57-
pdf_loader.load_file(dummy_pdf_path)
62+
pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem())
5863

5964

6065
def test_markdown_processing_error(dummy_md_path: str) -> None:
61-
with patch("builtins.open", side_effect=Exception("Failed to open")):
66+
with patch(
67+
"fsspec.implementations.local.LocalFileSystem.open",
68+
side_effect=Exception("Failed to open"),
69+
):
6270
with pytest.raises(MarkdownLoadError):
63-
MarkdownLoader.load_file(dummy_md_path)
71+
MarkdownLoader.load_file(dummy_md_path, fs=LocalFileSystem())
6472

6573

6674
def test_markdown_loading() -> None:
6775
md_path = str(BASE_DIR / "sample_data/hello.md")
68-
text = MarkdownLoader.load_file(md_path)
76+
text = MarkdownLoader.load_file(md_path, fs=LocalFileSystem())
6977
assert "# Hello" in text
7078
assert "Markdown **content**" in text
7179

@@ -81,7 +89,7 @@ async def test_markdown_loader_run() -> None:
8189

8290
@pytest.mark.asyncio
8391
async def test_pdf_loader_run() -> None:
84-
"""``PdfLoader.run`` wraps ``load_file`` with :class:`DocumentInfo`."""
92+
"""``PdfLoader.run`` wraps ``load_file`` with :class:`DocumentInfo` (default ``fs``)."""
8593
pdf_path = BASE_DIR / "sample_data/lorem_ipsum.pdf"
8694
loader = PdfLoader()
8795
doc = await loader.run(filepath=pdf_path)
@@ -90,6 +98,25 @@ async def test_pdf_loader_run() -> None:
9098
assert doc.text == "Lorem ipsum dolor sit amet."
9199

92100

101+
@pytest.mark.asyncio
102+
async def test_pdf_loader_run_fs_string_resolves_with_fsspec(
103+
dummy_pdf_path: str,
104+
) -> None:
105+
"""``fs`` may be a protocol name passed to ``fsspec.filesystem`` (e.g. ``\"file\"``)."""
106+
loader = PdfLoader()
107+
doc = await loader.run(filepath=dummy_pdf_path, fs="file")
108+
assert "Lorem ipsum" in doc.text
109+
110+
111+
@pytest.mark.asyncio
112+
async def test_markdown_loader_run_fs_string() -> None:
113+
md_path = str(BASE_DIR / "sample_data/hello.md")
114+
loader = MarkdownLoader()
115+
doc = await loader.run(filepath=md_path, fs="file")
116+
assert doc.document_info.document_type == DocumentType.MARKDOWN
117+
assert "# Hello" in doc.text
118+
119+
93120
@pytest.mark.asyncio
94121
async def test_run_passes_metadata_to_document_info(dummy_pdf_path: str) -> None:
95122
loader = PdfLoader()
@@ -105,8 +132,9 @@ async def run(
105132
self,
106133
filepath: Union[str, Path],
107134
metadata: Optional[dict[str, str]] = None,
135+
fs: Optional[Union[AbstractFileSystem, str]] = None,
108136
) -> LoadedDocument:
109-
return await super().run(filepath=filepath, metadata=metadata)
137+
return await super().run(filepath=filepath, metadata=metadata, fs=fs)
110138

111139
def get_document_metadata(
112140
self, text: str, metadata: dict[str, str] | None = None
@@ -130,6 +158,18 @@ async def test_get_document_metadata_override_merges_into_document_info(
130158
assert doc.document_info.metadata["text_length"] == str(len(doc.text))
131159

132160

161+
def test_pdf_loader_non_local_filesystem_branch_uses_bytesio(
162+
dummy_pdf_path: str,
163+
) -> None:
164+
"""Non-\"default\" local FS (``auto_mkdir=True``) reads into BytesIO for pypdf."""
165+
from neo4j_graphrag.experimental.components.data_loader import is_default_fs
166+
167+
fs = LocalFileSystem(auto_mkdir=True)
168+
assert is_default_fs(fs) is False
169+
text = PdfLoader.load_file(dummy_pdf_path, fs=fs)
170+
assert text == "Lorem ipsum dolor sit amet."
171+
172+
133173
def test_pdf_loader_backward_compat_reexport_module() -> None:
134174
"""``pdf_loader`` submodule re-exports the same classes as ``data_loader``."""
135175
from neo4j_graphrag.experimental.components.data_loader import (

0 commit comments

Comments
 (0)