Skip to content

Commit f844bad

Browse files
committed
try rewriting source to reference git repo and path
1 parent 0072e2a commit f844bad

File tree

1 file changed

+14
-2
lines changed

1 file changed

+14
-2
lines changed

loaders/git.py

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -81,15 +81,27 @@ def load(self) -> List[Document]:
8181
pdf_files = [f for f in matched_files if f.suffix.lower() == ".pdf"]
8282
text_files = [f for f in matched_files if f.suffix.lower() != ".pdf"]
8383

84+
docs: List[Document] = []
8485
if pdf_files:
8586
logger.info("Loading %d PDF file(s) from %s", len(pdf_files), repo_url)
86-
all_chunks.extend(self.pdf_loader.load(pdf_files))
87+
docs.extend(self.pdf_loader.load(pdf_files))
8788

8889
if text_files:
8990
logger.info(
9091
"Loading %d text file(s) from %s", len(text_files), repo_url
9192
)
92-
all_chunks.extend(self.text_loader.load(text_files))
93+
docs.extend(self.text_loader.load(text_files))
94+
95+
for doc in docs:
96+
local_src = Path(doc.metadata.get("source", ""))
97+
try:
98+
rel_path = local_src.relative_to(repo_path)
99+
except ValueError:
100+
rel_path = local_src
101+
102+
doc.metadata.update({"source": f"{repo_url}@{rel_path.as_posix()}"})
103+
104+
all_chunks.extend(docs)
93105

94106
return all_chunks
95107

0 commit comments

Comments
 (0)