Skip to content

Commit 40bcf6f

Browse files
authored
fix bug: reader cannot set metadata to DocNode (#540)
1 parent 9c5df30 commit 40bcf6f

File tree

2 files changed

+24
-4
lines changed

2 files changed

+24
-4
lines changed

lazyllm/tools/rag/dataReader.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -198,10 +198,11 @@ def _exclude_metadata(self, documents: List[DocNode]) -> List[DocNode]:
198198
def load_file(input_file: Path, metadata_genf: Callable[[str], Dict], file_extractor: Dict[str, Callable],
199199
encoding: str = "utf-8", pathm: PurePath = Path, fs: Optional[AbstractFileSystem] = None,
200200
metadata: Optional[Dict] = None) -> List[DocNode]:
201-
metadata: dict = metadata or {}
201+
# metadata priority: user > reader > metadata_genf
202+
user_metadata: Dict = metadata or {}
203+
metadata_generated: Dict = metadata_genf(str(input_file)) if metadata_genf is not None else {}
202204
documents: List[DocNode] = []
203205

204-
if metadata_genf is not None: metadata.update(metadata_genf(str(input_file)))
205206
file_reader_patterns = list(file_extractor.keys())
206207

207208
for pattern in file_reader_patterns:
@@ -213,8 +214,11 @@ def load_file(input_file: Path, metadata_genf: Callable[[str], Dict], file_extra
213214
kwargs = {'fs': fs} if fs and not is_default_fs(fs) else {}
214215
docs = reader(input_file, **kwargs)
215216
if isinstance(docs, DocNode): docs = [docs]
216-
for doc in docs: doc._global_metadata = metadata
217-
217+
for doc in docs:
218+
metadata = metadata_generated.copy()
219+
metadata.update(doc._global_metadata or {})
220+
metadata.update(user_metadata)
221+
doc._global_metadata = metadata
218222
if config['rag_filename_as_id']:
219223
for i, doc in enumerate(docs):
220224
doc._uid = f"{input_file!s}_index_{i}"

tests/basic_tests/test_rag_reader.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import pytest
44
from lazyllm.tools.rag.readers import ReaderBase
55
from lazyllm.tools.rag import SimpleDirectoryReader, DocNode, Document
6+
from lazyllm.tools.rag.dataReader import RAG_DOC_CREATION_DATE
67

78
class YmlReader(ReaderBase):
89
def _load_data(self, file, fs=None):
@@ -19,6 +20,13 @@ def processYml(file):
1920
node._content = "Call the function processYml."
2021
return [node]
2122

23+
def processYmlWithMetadata(file):
24+
with open(file, 'r') as f:
25+
data = f.read()
26+
node = DocNode(text=data, metadata=dict(m='m'), global_metadata={RAG_DOC_CREATION_DATE: '00-00'})
27+
node._content = 'Call the function processYml.'
28+
return [node]
29+
2230
class TestRagReader(object):
2331
def setup_method(self):
2432
self.doc1 = Document(dataset_path="ci_data/rag_reader_full", manager=False)
@@ -62,6 +70,14 @@ def test_register_global_reader(self):
6270
docs = self.doc1._impl._reader.load_data(input_files=files)
6371
assert docs[0].text == "Call the function processYml."
6472

73+
def test_register_reader_metadata(self):
74+
self.doc1.add_reader('**/*.yml', processYmlWithMetadata)
75+
files = [os.path.join(self.datasets, 'reader_test.yml')]
76+
docs = self.doc1._impl._reader.load_data(input_files=files)
77+
assert docs[0].text == 'Call the function processYml.'
78+
assert docs[0].metadata.get('m') == 'm'
79+
assert docs[0].global_metadata.get(RAG_DOC_CREATION_DATE) == '00-00'
80+
6581
def test_register_local_and_global_reader(self):
6682
files = [os.path.join(self.datasets, "reader_test.yml")]
6783

0 commit comments

Comments
 (0)