Skip to content

Commit baa11ee

Browse files
Remove fs
1 parent fb57c60 commit baa11ee

File tree

4 files changed

+20
-94
lines changed

4 files changed

+20
-94
lines changed

examples/customize/build_graph/components/loaders/custom_loader.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,6 @@
33
from pathlib import Path
44
from typing import Dict, Optional, Union
55

6-
from fsspec import AbstractFileSystem
7-
86
from neo4j_graphrag.experimental.components.data_loader import DataLoader
97
from neo4j_graphrag.experimental.components.types import DocumentInfo, LoadedDocument
108

@@ -14,10 +12,8 @@ async def run(
1412
self,
1513
filepath: Union[str, Path],
1614
metadata: Optional[Dict[str, str]] = None,
17-
fs: Optional[Union[AbstractFileSystem, str]] = None,
1815
) -> LoadedDocument:
19-
# Implement logic here; use ``fs`` when reading from non-local storage.
20-
_ = fs
16+
# Implement logic here to read and transform the input file.
2117
return LoadedDocument(
2218
text="<extracted text>",
2319
document_info=DocumentInfo(

src/neo4j_graphrag/experimental/components/data_loader.py

Lines changed: 9 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -16,15 +16,11 @@
1616

1717
from __future__ import annotations
1818

19-
import io
2019
from abc import abstractmethod
2120
from pathlib import Path
22-
from typing import Dict, Optional, Union, cast
21+
from typing import Dict, Optional, Union
2322

24-
import fsspec
2523
import pypdf
26-
from fsspec import AbstractFileSystem
27-
from fsspec.implementations.local import LocalFileSystem
2824

2925
from neo4j_graphrag.exceptions import MarkdownLoadError, PdfLoaderError
3026
from neo4j_graphrag.experimental.components.types import (
@@ -35,10 +31,6 @@
3531
from neo4j_graphrag.experimental.pipeline.component import Component
3632

3733

38-
def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
39-
return isinstance(fs, LocalFileSystem) and not fs.auto_mkdir
40-
41-
4234
class DataLoader(Component):
4335
"""
4436
Interface for loading data of various input types.
@@ -54,23 +46,18 @@ async def run(
5446
self,
5547
filepath: Union[str, Path],
5648
metadata: Optional[Dict[str, str]] = None,
57-
fs: Optional[Union[AbstractFileSystem, str]] = None,
5849
) -> LoadedDocument: ...
5950

6051

6152
class PdfLoader(DataLoader):
6253
"""Loads text from PDF files using pypdf."""
6354

6455
@staticmethod
65-
def load_file(
66-
file: str,
67-
fs: AbstractFileSystem,
68-
) -> str:
56+
def load_file(file: str) -> str:
6957
"""Parse a PDF file and return extracted text."""
7058
try:
71-
with fs.open(file, "rb") as fp:
72-
stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
73-
pdf = pypdf.PdfReader(stream)
59+
with open(file, "rb") as fp:
60+
pdf = pypdf.PdfReader(fp)
7461
num_pages = len(pdf.pages)
7562
text_parts = (
7663
pdf.pages[page].extract_text() for page in range(num_pages)
@@ -83,15 +70,10 @@ async def run(
8370
self,
8471
filepath: Union[str, Path],
8572
metadata: Optional[Dict[str, str]] = None,
86-
fs: Optional[Union[AbstractFileSystem, str]] = None,
8773
) -> LoadedDocument:
8874
if not isinstance(filepath, str):
8975
filepath = str(filepath)
90-
if isinstance(fs, str):
91-
fs = fsspec.filesystem(fs)
92-
elif fs is None:
93-
fs = LocalFileSystem()
94-
text = self.load_file(filepath, fs)
76+
text = self.load_file(filepath)
9577
return LoadedDocument(
9678
text=text,
9779
document_info=DocumentInfo(
@@ -106,30 +88,22 @@ class MarkdownLoader(DataLoader):
10688
"""Loads UTF-8 Markdown (``.md`` / ``.markdown``) files as plain text."""
10789

10890
@staticmethod
109-
def load_file(
110-
file: str,
111-
fs: AbstractFileSystem,
112-
) -> str:
91+
def load_file(file: str) -> str:
11392
try:
114-
with fs.open(file, "rb") as fp:
93+
with open(file, "rb") as fp:
11594
raw = fp.read()
116-
return cast(str, raw.decode("utf-8"))
95+
return raw.decode("utf-8")
11796
except Exception as e:
11897
raise MarkdownLoadError(e)
11998

12099
async def run(
121100
self,
122101
filepath: Union[str, Path],
123102
metadata: Optional[Dict[str, str]] = None,
124-
fs: Optional[Union[AbstractFileSystem, str]] = None,
125103
) -> LoadedDocument:
126104
if not isinstance(filepath, str):
127105
filepath = str(filepath)
128-
if isinstance(fs, str):
129-
fs = fsspec.filesystem(fs)
130-
elif fs is None:
131-
fs = LocalFileSystem()
132-
text = MarkdownLoader.load_file(filepath, fs)
106+
text = MarkdownLoader.load_file(filepath)
133107
return LoadedDocument(
134108
text=text,
135109
document_info=DocumentInfo(

src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
)
2727
import warnings
2828

29-
from fsspec import AbstractFileSystem
3029
from pydantic import ConfigDict, Field, field_validator, model_validator
3130
from typing_extensions import Self
3231

@@ -84,16 +83,13 @@ async def run(
8483
self,
8584
filepath: Union[str, Path],
8685
metadata: Optional[dict[str, str]] = None,
87-
fs: Optional[Union[AbstractFileSystem, str]] = None,
8886
) -> LoadedDocument:
8987
path_str = str(filepath)
9088
suffix = Path(path_str).suffix.lower()
9189
if suffix == ".pdf":
92-
return await PdfLoader().run(filepath=path_str, metadata=metadata, fs=fs)
90+
return await PdfLoader().run(filepath=path_str, metadata=metadata)
9391
if suffix in (".md", ".markdown"):
94-
return await MarkdownLoader().run(
95-
filepath=path_str, metadata=metadata, fs=fs
96-
)
92+
return await MarkdownLoader().run(filepath=path_str, metadata=metadata)
9793
raise UnsupportedDocumentFormatError(
9894
f"Unsupported document format: {suffix!r}. "
9995
f"Supported: .pdf, .md, .markdown"

tests/unit/experimental/components/test_data_loader.py

Lines changed: 8 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,6 @@
2020
from unittest.mock import patch
2121

2222
import pytest
23-
from fsspec import AbstractFileSystem
24-
from fsspec.implementations.local import LocalFileSystem
2523
from neo4j_graphrag.exceptions import MarkdownLoadError, PdfLoaderError
2624
from neo4j_graphrag.experimental.components.data_loader import (
2725
MarkdownLoader,
@@ -49,31 +47,25 @@ def dummy_md_path() -> str:
4947

5048
def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
5149
expected_content = "Lorem ipsum dolor sit amet."
52-
actual_content = pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem())
50+
actual_content = pdf_loader.load_file(dummy_pdf_path)
5351
assert actual_content == expected_content
5452

5553

5654
def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
57-
with patch(
58-
"fsspec.implementations.local.LocalFileSystem.open",
59-
side_effect=Exception("Failed to open"),
60-
):
55+
with patch("builtins.open", side_effect=Exception("Failed to open")):
6156
with pytest.raises(PdfLoaderError):
62-
pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem())
57+
pdf_loader.load_file(dummy_pdf_path)
6358

6459

6560
def test_markdown_processing_error(dummy_md_path: str) -> None:
66-
with patch(
67-
"fsspec.implementations.local.LocalFileSystem.open",
68-
side_effect=Exception("Failed to open"),
69-
):
61+
with patch("builtins.open", side_effect=Exception("Failed to open")):
7062
with pytest.raises(MarkdownLoadError):
71-
MarkdownLoader.load_file(dummy_md_path, fs=LocalFileSystem())
63+
MarkdownLoader.load_file(dummy_md_path)
7264

7365

7466
def test_markdown_loading() -> None:
7567
md_path = str(BASE_DIR / "sample_data/hello.md")
76-
text = MarkdownLoader.load_file(md_path, fs=LocalFileSystem())
68+
text = MarkdownLoader.load_file(md_path)
7769
assert "# Hello" in text
7870
assert "Markdown **content**" in text
7971

@@ -89,7 +81,7 @@ async def test_markdown_loader_run() -> None:
8981

9082
@pytest.mark.asyncio
9183
async def test_pdf_loader_run() -> None:
92-
"""``PdfLoader.run`` wraps ``load_file`` with :class:`DocumentInfo` (default ``fs``)."""
84+
"""``PdfLoader.run`` wraps ``load_file`` with :class:`DocumentInfo`."""
9385
pdf_path = BASE_DIR / "sample_data/lorem_ipsum.pdf"
9486
loader = PdfLoader()
9587
doc = await loader.run(filepath=pdf_path)
@@ -98,25 +90,6 @@ async def test_pdf_loader_run() -> None:
9890
assert doc.text == "Lorem ipsum dolor sit amet."
9991

10092

101-
@pytest.mark.asyncio
102-
async def test_pdf_loader_run_fs_string_resolves_with_fsspec(
103-
dummy_pdf_path: str,
104-
) -> None:
105-
"""``fs`` may be a protocol name passed to ``fsspec.filesystem`` (e.g. ``\"file\"``)."""
106-
loader = PdfLoader()
107-
doc = await loader.run(filepath=dummy_pdf_path, fs="file")
108-
assert "Lorem ipsum" in doc.text
109-
110-
111-
@pytest.mark.asyncio
112-
async def test_markdown_loader_run_fs_string() -> None:
113-
md_path = str(BASE_DIR / "sample_data/hello.md")
114-
loader = MarkdownLoader()
115-
doc = await loader.run(filepath=md_path, fs="file")
116-
assert doc.document_info.document_type == DocumentType.MARKDOWN
117-
assert "# Hello" in doc.text
118-
119-
12093
@pytest.mark.asyncio
12194
async def test_run_passes_metadata_to_document_info(dummy_pdf_path: str) -> None:
12295
loader = PdfLoader()
@@ -132,9 +105,8 @@ async def run(
132105
self,
133106
filepath: Union[str, Path],
134107
metadata: Optional[dict[str, str]] = None,
135-
fs: Optional[Union[AbstractFileSystem, str]] = None,
136108
) -> LoadedDocument:
137-
return await super().run(filepath=filepath, metadata=metadata, fs=fs)
109+
return await super().run(filepath=filepath, metadata=metadata)
138110

139111
def get_document_metadata(
140112
self, text: str, metadata: dict[str, str] | None = None
@@ -158,18 +130,6 @@ async def test_get_document_metadata_override_merges_into_document_info(
158130
assert doc.document_info.metadata["text_length"] == str(len(doc.text))
159131

160132

161-
def test_pdf_loader_non_local_filesystem_branch_uses_bytesio(
162-
dummy_pdf_path: str,
163-
) -> None:
164-
"""Non-\"default\" local FS (``auto_mkdir=True``) reads into BytesIO for pypdf."""
165-
from neo4j_graphrag.experimental.components.data_loader import is_default_fs
166-
167-
fs = LocalFileSystem(auto_mkdir=True)
168-
assert is_default_fs(fs) is False
169-
text = PdfLoader.load_file(dummy_pdf_path, fs=fs)
170-
assert text == "Lorem ipsum dolor sit amet."
171-
172-
173133
def test_pdf_loader_backward_compat_reexport_module() -> None:
174134
"""``pdf_loader`` submodule re-exports the same classes as ``data_loader``."""
175135
from neo4j_graphrag.experimental.components.data_loader import (

0 commit comments

Comments
 (0)