Skip to content

Commit 60fa14e

Browse files
Add max_chars
1 parent baa11ee commit 60fa14e

File tree

5 files changed

+56
-5
lines changed

5 files changed

+56
-5
lines changed

examples/customize/build_graph/components/loaders/custom_loader.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,10 @@ async def run(
1212
self,
1313
filepath: Union[str, Path],
1414
metadata: Optional[Dict[str, str]] = None,
15+
max_chars: Optional[int] = None,
1516
) -> LoadedDocument:
1617
# Implement logic here to read and transform the input file.
18+
_ = max_chars
1719
return LoadedDocument(
1820
text="<extracted text>",
1921
document_info=DocumentInfo(

src/neo4j_graphrag/experimental/components/data_loader.py

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -41,11 +41,20 @@ def get_document_metadata(
4141
) -> Dict[str, str] | None:
4242
return metadata
4343

44+
@staticmethod
45+
def _apply_max_chars(text: str, max_chars: Optional[int] = None) -> str:
46+
if max_chars is None:
47+
return text
48+
if max_chars < 0:
49+
raise ValueError("max_chars must be >= 0")
50+
return text[:max_chars]
51+
4452
@abstractmethod
4553
async def run(
4654
self,
4755
filepath: Union[str, Path],
4856
metadata: Optional[Dict[str, str]] = None,
57+
max_chars: Optional[int] = None,
4958
) -> LoadedDocument: ...
5059

5160

@@ -70,10 +79,11 @@ async def run(
7079
self,
7180
filepath: Union[str, Path],
7281
metadata: Optional[Dict[str, str]] = None,
82+
max_chars: Optional[int] = None,
7383
) -> LoadedDocument:
7484
if not isinstance(filepath, str):
7585
filepath = str(filepath)
76-
text = self.load_file(filepath)
86+
text = self._apply_max_chars(self.load_file(filepath), max_chars=max_chars)
7787
return LoadedDocument(
7888
text=text,
7989
document_info=DocumentInfo(
@@ -100,10 +110,13 @@ async def run(
100110
self,
101111
filepath: Union[str, Path],
102112
metadata: Optional[Dict[str, str]] = None,
113+
max_chars: Optional[int] = None,
103114
) -> LoadedDocument:
104115
if not isinstance(filepath, str):
105116
filepath = str(filepath)
106-
text = MarkdownLoader.load_file(filepath)
117+
text = self._apply_max_chars(
118+
MarkdownLoader.load_file(filepath), max_chars=max_chars
119+
)
107120
return LoadedDocument(
108121
text=text,
109122
document_info=DocumentInfo(

src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -83,13 +83,18 @@ async def run(
8383
self,
8484
filepath: Union[str, Path],
8585
metadata: Optional[dict[str, str]] = None,
86+
max_chars: Optional[int] = None,
8687
) -> LoadedDocument:
8788
path_str = str(filepath)
8889
suffix = Path(path_str).suffix.lower()
8990
if suffix == ".pdf":
90-
return await PdfLoader().run(filepath=path_str, metadata=metadata)
91+
return await PdfLoader().run(
92+
filepath=path_str, metadata=metadata, max_chars=max_chars
93+
)
9194
if suffix in (".md", ".markdown"):
92-
return await MarkdownLoader().run(filepath=path_str, metadata=metadata)
95+
return await MarkdownLoader().run(
96+
filepath=path_str, metadata=metadata, max_chars=max_chars
97+
)
9398
raise UnsupportedDocumentFormatError(
9499
f"Unsupported document format: {suffix!r}. "
95100
f"Supported: .pdf, .md, .markdown"
@@ -426,6 +431,9 @@ def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]:
426431
)
427432
run_params["file_loader"]["filepath"] = file_path
428433
run_params["file_loader"]["metadata"] = user_input.get("document_metadata")
434+
max_chars = user_input.get("max_chars")
435+
if max_chars is not None:
436+
run_params["file_loader"]["max_chars"] = max_chars
429437
else:
430438
if not text:
431439
raise PipelineDefinitionError(

tests/unit/experimental/components/test_data_loader.py

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,14 @@ async def test_markdown_loader_run() -> None:
7979
assert "# Hello" in doc.text
8080

8181

82+
@pytest.mark.asyncio
83+
async def test_markdown_loader_run_max_chars() -> None:
84+
md_path = BASE_DIR / "sample_data/hello.md"
85+
loader = MarkdownLoader()
86+
doc = await loader.run(filepath=md_path, max_chars=7)
87+
assert doc.text == "# Hello"
88+
89+
8290
@pytest.mark.asyncio
8391
async def test_pdf_loader_run() -> None:
8492
"""``PdfLoader.run`` wraps ``load_file`` with :class:`DocumentInfo`."""
@@ -90,6 +98,14 @@ async def test_pdf_loader_run() -> None:
9098
assert doc.text == "Lorem ipsum dolor sit amet."
9199

92100

101+
@pytest.mark.asyncio
102+
async def test_pdf_loader_run_max_chars() -> None:
103+
pdf_path = BASE_DIR / "sample_data/lorem_ipsum.pdf"
104+
loader = PdfLoader()
105+
doc = await loader.run(filepath=pdf_path, max_chars=5)
106+
assert doc.text == "Lorem"
107+
108+
93109
@pytest.mark.asyncio
94110
async def test_run_passes_metadata_to_document_info(dummy_pdf_path: str) -> None:
95111
loader = PdfLoader()
@@ -105,8 +121,11 @@ async def run(
105121
self,
106122
filepath: Union[str, Path],
107123
metadata: Optional[dict[str, str]] = None,
124+
max_chars: Optional[int] = None,
108125
) -> LoadedDocument:
109-
return await super().run(filepath=filepath, metadata=metadata)
126+
return await super().run(
127+
filepath=filepath, metadata=metadata, max_chars=max_chars
128+
)
110129

111130
def get_document_metadata(
112131
self, text: str, metadata: dict[str, str] | None = None

tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -370,6 +370,15 @@ def test_simple_kg_pipeline_config_run_params_from_file_file_path() -> None:
370370
}
371371

372372

373+
def test_simple_kg_pipeline_config_run_params_from_file_file_path_with_max_chars() -> (
374+
None
375+
):
376+
config = SimpleKGPipelineConfig(from_file=True)
377+
assert config.get_run_params({"file_path": "my_file", "max_chars": 32}) == {
378+
"file_loader": {"filepath": "my_file", "metadata": None, "max_chars": 32}
379+
}
380+
381+
373382
def test_simple_kg_pipeline_config_run_params_from_text_text() -> None:
374383
config = SimpleKGPipelineConfig(from_file=False)
375384
run_params = config.get_run_params({"text": "my text"})

0 commit comments

Comments
 (0)