Skip to content

Commit 2c09b17

Browse files
Add kwargs
1 parent 13aca76 commit 2c09b17

File tree

4 files changed

+10
-66
lines changed

4 files changed

+10
-66
lines changed

src/neo4j_graphrag/experimental/components/data_loader.py

Lines changed: 8 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
import io
2020
from abc import abstractmethod
2121
from pathlib import Path
22-
from typing import Dict, Optional, Union, cast
22+
from typing import Any, Dict, Optional, Union, cast
2323

2424
import fsspec
2525
import pypdf
@@ -49,21 +49,12 @@ def get_document_metadata(
4949
) -> Dict[str, str] | None:
5050
return metadata
5151

52-
@staticmethod
53-
def _apply_max_chars(text: str, max_chars: Optional[int] = None) -> str:
54-
if max_chars is None:
55-
return text
56-
if max_chars < 0:
57-
raise ValueError("max_chars must be >= 0")
58-
return text[:max_chars]
59-
6052
@abstractmethod
6153
async def run(
6254
self,
6355
filepath: Union[str, Path],
6456
metadata: Optional[Dict[str, str]] = None,
65-
fs: Optional[Union[AbstractFileSystem, str]] = None,
66-
max_chars: Optional[int] = None,
57+
**kwargs: Any,
6758
) -> LoadedDocument: ...
6859

6960

@@ -92,18 +83,16 @@ async def run(
9283
self,
9384
filepath: Union[str, Path],
9485
metadata: Optional[Dict[str, str]] = None,
95-
fs: Optional[Union[AbstractFileSystem, str]] = None,
96-
max_chars: Optional[int] = None,
86+
**kwargs: Any,
9787
) -> LoadedDocument:
88+
fs = kwargs.get("fs")
9889
if not isinstance(filepath, str):
9990
filepath = str(filepath)
10091
if isinstance(fs, str):
10192
fs = fsspec.filesystem(fs)
10293
elif fs is None:
10394
fs = LocalFileSystem()
104-
text = self._apply_max_chars(
105-
self.load_file(filepath, fs), max_chars=max_chars
106-
)
95+
text = self.load_file(filepath, fs)
10796
return LoadedDocument(
10897
text=text,
10998
document_info=DocumentInfo(
@@ -133,18 +122,16 @@ async def run(
133122
self,
134123
filepath: Union[str, Path],
135124
metadata: Optional[Dict[str, str]] = None,
136-
fs: Optional[Union[AbstractFileSystem, str]] = None,
137-
max_chars: Optional[int] = None,
125+
**kwargs: Any,
138126
) -> LoadedDocument:
127+
fs = kwargs.get("fs")
139128
if not isinstance(filepath, str):
140129
filepath = str(filepath)
141130
if isinstance(fs, str):
142131
fs = fsspec.filesystem(fs)
143132
elif fs is None:
144133
fs = LocalFileSystem()
145-
text = self._apply_max_chars(
146-
MarkdownLoader.load_file(filepath, fs), max_chars=max_chars
147-
)
134+
text = MarkdownLoader.load_file(filepath, fs)
148135
return LoadedDocument(
149136
text=text,
150137
document_info=DocumentInfo(

src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py

Lines changed: 2 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,6 @@
2626
)
2727
import warnings
2828

29-
from fsspec import AbstractFileSystem
3029
from pydantic import ConfigDict, Field, field_validator, model_validator
3130
from typing_extensions import Self
3231

@@ -84,24 +83,22 @@ async def run(
8483
self,
8584
filepath: Union[str, Path],
8685
metadata: Optional[dict[str, str]] = None,
87-
fs: Optional[Union[AbstractFileSystem, str]] = None,
88-
max_chars: Optional[int] = None,
86+
**kwargs: Any,
8987
) -> LoadedDocument:
88+
fs = kwargs.get("fs")
9089
path_str = str(filepath)
9190
suffix = Path(path_str).suffix.lower()
9291
if suffix == ".pdf":
9392
return await PdfLoader().run(
9493
filepath=path_str,
9594
metadata=metadata,
9695
fs=fs,
97-
max_chars=max_chars,
9896
)
9997
if suffix in (".md", ".markdown"):
10098
return await MarkdownLoader().run(
10199
filepath=path_str,
102100
metadata=metadata,
103101
fs=fs,
104-
max_chars=max_chars,
105102
)
106103
raise UnsupportedDocumentFormatError(
107104
f"Unsupported document format: {suffix!r}. "
@@ -439,10 +436,6 @@ def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]:
439436
)
440437
run_params["file_loader"]["filepath"] = file_path
441438
run_params["file_loader"]["metadata"] = user_input.get("document_metadata")
442-
max_chars = user_input.get("max_chars")
443-
# Backward-compatible: only forward new arg for the default loader.
444-
if max_chars is not None and self.file_loader is None:
445-
run_params["file_loader"]["max_chars"] = max_chars
446439
else:
447440
if not text:
448441
raise PipelineDefinitionError(

tests/unit/experimental/components/test_data_loader.py

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -87,14 +87,6 @@ async def test_markdown_loader_run() -> None:
8787
assert "# Hello" in doc.text
8888

8989

90-
@pytest.mark.asyncio
91-
async def test_markdown_loader_run_max_chars() -> None:
92-
md_path = str(BASE_DIR / "sample_data/hello.md")
93-
loader = MarkdownLoader()
94-
doc = await loader.run(filepath=md_path, max_chars=7)
95-
assert doc.text == "# Hello"
96-
97-
9890
@pytest.mark.asyncio
9991
async def test_pdf_loader_run() -> None:
10092
"""``PdfLoader.run`` wraps ``load_file`` with :class:`DocumentInfo` (default ``fs``)."""
@@ -106,14 +98,6 @@ async def test_pdf_loader_run() -> None:
10698
assert doc.text == "Lorem ipsum dolor sit amet."
10799

108100

109-
@pytest.mark.asyncio
110-
async def test_pdf_loader_run_max_chars() -> None:
111-
pdf_path = str(BASE_DIR / "sample_data/lorem_ipsum.pdf")
112-
loader = PdfLoader()
113-
doc = await loader.run(filepath=pdf_path, max_chars=5)
114-
assert doc.text == "Lorem"
115-
116-
117101
@pytest.mark.asyncio
118102
async def test_pdf_loader_run_fs_string_resolves_with_fsspec(
119103
dummy_pdf_path: str,

tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py

Lines changed: 0 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -370,26 +370,6 @@ def test_simple_kg_pipeline_config_run_params_from_file_file_path() -> None:
370370
}
371371

372372

373-
def test_simple_kg_pipeline_config_run_params_from_file_file_path_with_max_chars() -> (
374-
None
375-
):
376-
config = SimpleKGPipelineConfig(from_file=True)
377-
assert config.get_run_params({"file_path": "my_file", "max_chars": 42}) == {
378-
"file_loader": {"filepath": "my_file", "metadata": None, "max_chars": 42}
379-
}
380-
381-
382-
def test_simple_kg_pipeline_config_run_params_custom_file_loader_ignores_max_chars() -> (
383-
None
384-
):
385-
config = SimpleKGPipelineConfig(
386-
from_file=True, file_loader=ComponentType(PdfLoader())
387-
)
388-
assert config.get_run_params({"file_path": "my_file", "max_chars": 42}) == {
389-
"file_loader": {"filepath": "my_file", "metadata": None}
390-
}
391-
392-
393373
def test_simple_kg_pipeline_config_run_params_from_text_text() -> None:
394374
config = SimpleKGPipelineConfig(from_file=False)
395375
run_params = config.get_run_params({"text": "my text"})

0 commit comments

Comments
 (0)