2020from unittest .mock import patch
2121
2222import pytest
23+ from fsspec import AbstractFileSystem
24+ from fsspec .implementations .local import LocalFileSystem
2325from neo4j_graphrag .exceptions import MarkdownLoadError , PdfLoaderError
2426from neo4j_graphrag .experimental .components .data_loader import (
2527 MarkdownLoader ,
@@ -47,25 +49,31 @@ def dummy_md_path() -> str:
4749
def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
    """``load_file`` returns the PDF text when given an explicit ``LocalFileSystem``."""
    expected_content = "Lorem ipsum dolor sit amet."
    actual_content = pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem())
    assert actual_content == expected_content
5254
5355
def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
    """An exception raised while opening the file surfaces as ``PdfLoaderError``."""
    # The patch target is ``LocalFileSystem.open`` (not ``builtins.open``) so
    # that it matches the ``fs`` object handed to ``load_file`` below.
    with patch(
        "fsspec.implementations.local.LocalFileSystem.open",
        side_effect=Exception("Failed to open"),
    ):
        with pytest.raises(PdfLoaderError):
            pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem())
5863
5964
def test_markdown_processing_error(dummy_md_path: str) -> None:
    """An exception raised while opening the file surfaces as ``MarkdownLoadError``."""
    # The patch target is ``LocalFileSystem.open`` (not ``builtins.open``) so
    # that it matches the ``fs`` object handed to ``load_file`` below.
    with patch(
        "fsspec.implementations.local.LocalFileSystem.open",
        side_effect=Exception("Failed to open"),
    ):
        with pytest.raises(MarkdownLoadError):
            MarkdownLoader.load_file(dummy_md_path, fs=LocalFileSystem())
6472
6573
def test_markdown_loading() -> None:
    """``MarkdownLoader.load_file`` returns the raw Markdown text of ``hello.md``."""
    md_path = str(BASE_DIR / "sample_data/hello.md")
    text = MarkdownLoader.load_file(md_path, fs=LocalFileSystem())
    assert "# Hello" in text
    assert "Markdown **content**" in text
7179
@@ -81,7 +89,7 @@ async def test_markdown_loader_run() -> None:
8189
8290@pytest .mark .asyncio
8391async def test_pdf_loader_run () -> None :
84- """``PdfLoader.run`` wraps ``load_file`` with :class:`DocumentInfo`."""
92+ """``PdfLoader.run`` wraps ``load_file`` with :class:`DocumentInfo` (default ``fs``) ."""
8593 pdf_path = BASE_DIR / "sample_data/lorem_ipsum.pdf"
8694 loader = PdfLoader ()
8795 doc = await loader .run (filepath = pdf_path )
@@ -90,6 +98,25 @@ async def test_pdf_loader_run() -> None:
9098 assert doc .text == "Lorem ipsum dolor sit amet."
9199
92100
@pytest.mark.asyncio
async def test_pdf_loader_run_fs_string_resolves_with_fsspec(
    dummy_pdf_path: str,
) -> None:
    """``fs`` may be a protocol name passed to ``fsspec.filesystem`` (e.g. ``"file"``)."""
    loader = PdfLoader()
    doc = await loader.run(filepath=dummy_pdf_path, fs="file")
    assert "Lorem ipsum" in doc.text
109+
110+
@pytest.mark.asyncio
async def test_markdown_loader_run_fs_string() -> None:
    """``MarkdownLoader.run`` accepts ``fs`` as the string ``"file"`` and loads ``hello.md``."""
    md_path = str(BASE_DIR / "sample_data/hello.md")
    loader = MarkdownLoader()
    doc = await loader.run(filepath=md_path, fs="file")
    assert doc.document_info.document_type == DocumentType.MARKDOWN
    assert "# Hello" in doc.text
118+
119+
93120@pytest .mark .asyncio
94121async def test_run_passes_metadata_to_document_info (dummy_pdf_path : str ) -> None :
95122 loader = PdfLoader ()
@@ -105,8 +132,9 @@ async def run(
105132 self ,
106133 filepath : Union [str , Path ],
107134 metadata : Optional [dict [str , str ]] = None ,
135+ fs : Optional [Union [AbstractFileSystem , str ]] = None ,
108136 ) -> LoadedDocument :
109- return await super ().run (filepath = filepath , metadata = metadata )
137+ return await super ().run (filepath = filepath , metadata = metadata , fs = fs )
110138
111139 def get_document_metadata (
112140 self , text : str , metadata : dict [str , str ] | None = None
@@ -130,6 +158,18 @@ async def test_get_document_metadata_override_merges_into_document_info(
130158 assert doc .document_info .metadata ["text_length" ] == str (len (doc .text ))
131159
132160
def test_pdf_loader_non_local_filesystem_branch_uses_bytesio(
    dummy_pdf_path: str,
) -> None:
    """Non-"default" local FS (``auto_mkdir=True``) reads into BytesIO for pypdf."""
    from neo4j_graphrag.experimental.components.data_loader import is_default_fs

    fs = LocalFileSystem(auto_mkdir=True)
    # Guard the premise first: this filesystem must not count as the default
    # one, otherwise the load below would not exercise the intended branch.
    assert is_default_fs(fs) is False
    text = PdfLoader.load_file(dummy_pdf_path, fs=fs)
    assert text == "Lorem ipsum dolor sit amet."
171+
172+
133173def test_pdf_loader_backward_compat_reexport_module () -> None :
134174 """``pdf_loader`` submodule re-exports the same classes as ``data_loader``."""
135175 from neo4j_graphrag .experimental .components .data_loader import (
0 commit comments