Skip to content

Commit 8b7cf2f

Browse files
committed
feat(rag): regenerate nml vector stores
1 parent 4c0b70f commit 8b7cf2f

File tree

32 files changed

+53
-20
lines changed

32 files changed

+53
-20
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ repos:
77
- id: trailing-whitespace
88
- id: end-of-file-fixer
99
- id: check-added-large-files
10-
args: [ "--maxkb=5000"]
10+
args: [ "--maxkb=30000"]
1111
- repo: https://github.com/astral-sh/ruff-pre-commit
1212
rev: v0.14.14
1313
hooks:

data-sources/neuroml/nml.py

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
Author: Ankur Sinha <sanjay DOT ankur AT gmail DOT com>
99
"""
1010

11+
import json
1112
import logging
1213
import mimetypes
1314
from glob import glob
@@ -20,20 +21,23 @@
2021
MarkdownHeaderTextSplitter,
2122
RecursiveCharacterTextSplitter,
2223
)
23-
from neuroml_ai.rag.utils import setup_embedding
24+
from neuroml_ai_utils.llm import setup_embedding
25+
26+
logging.basicConfig(level=logging.WARNING)
2427

2528

2629
class NML(object):
30+
# limit to two header levels
2731
md_headers_to_split_on = [
2832
("#", "Header 1"),
2933
("##", "Header 2"),
30-
("###", "Header 3"),
31-
("####", "Header 4"),
34+
# ("###", "Header 3"),
35+
# ("####", "Header 4"),
3236
]
3337

3438
"""NeuroML vector store generator"""
3539

36-
def __init__(self, embedding_model: str, logging_level: int = logging.INFO):
40+
def __init__(self, embedding_model: str, logging_level: int = logging.DEBUG):
3741
"""TODO: to be defined."""
3842
self.chunk_size = 600
3943
self.chunk_overlap = 60
@@ -42,9 +46,9 @@ def __init__(self, embedding_model: str, logging_level: int = logging.INFO):
4246
my_path = Path(__file__).parent
4347
self.stores_sources_path = f"{my_path}/sources"
4448

45-
self.logger = logging.getLogger("NeuroML-AI")
49+
self.logger = logging.getLogger("NeuroML-doc-embeddings")
4650
self.logger.setLevel(logging_level)
47-
self.logger.propagate = False
51+
self.logger.propagate = True
4852

4953
def setup(self):
5054
"""Setup embeddings"""
@@ -96,8 +100,19 @@ def create(self):
96100
client_settings=chroma_client_settings_text,
97101
)
98102

99-
info_files = glob(f"{src}/*", recursive=True)
103+
info_files = glob(f"{src}/*.md", recursive=True)
104+
url_maps = glob(f"{src}/*.json", recursive=True)
100105
self.logger.debug(f"Loaded {len(info_files)} files from {src}")
106+
self.logger.debug(f"Loaded {len(url_maps)} url map files from {src}")
107+
108+
# only a single url map file is allowed here
109+
assert len(url_maps) <= 1
110+
111+
if len(url_maps) == 1:
112+
with open(url_maps[0], "r") as f:
113+
url_map_data = json.load(f)
114+
else:
115+
url_map_data = {}
101116

102117
for info_file in info_files:
103118
try:
@@ -108,7 +123,7 @@ def create(self):
108123

109124
if file_type:
110125
if "markdown" in file_type:
111-
self.add_md(store, info_file)
126+
self.add_md(store, info_file, url_map_data)
112127
else:
113128
self.logger.warning(
114129
f"File {info_file} is of type {file_type} which is not currently supported. Skipping"
@@ -118,7 +133,7 @@ def create(self):
118133
f"Could not guess file type for file {info_file}. Skipping"
119134
)
120135

121-
def add_md(self, store, file):
136+
def add_md(self, store, file, url_map):
122137
"""Add a markdown file to the vector store
123138
124139
We add the file hash as extra metadata so that we can filter on it
@@ -160,13 +175,33 @@ def add_md(self, store, file):
160175
)
161176
splits = text_splitter.split_documents(md_splits)
162177
for split in splits:
163-
split.metadata.update(
164-
{
165-
"file_hash": file_hash,
166-
"file_name": file_path.name,
167-
"file_path": str(file_path),
168-
}
169-
)
178+
# get url
179+
# header 1
180+
url = None
181+
if "Header 1" in split.metadata.keys():
182+
url = url_map.get(split.metadata["Header 1"], None)
183+
# try header 2: more specific
184+
if "Header 2" in split.metadata.keys():
185+
url = url_map.get(split.metadata["Header 2"], None)
186+
# fall back to default url
187+
if not url:
188+
url = url_map.get("DEFAULT_URL")
189+
190+
meta_update = {
191+
"file_hash": file_hash,
192+
"file_name": file_path.name,
193+
"file_path": str(file_path),
194+
"url": url,
195+
}
196+
self.logger.debug(f"{meta_update =}")
197+
198+
split.metadata.update(meta_update)
170199

171200
self.logger.debug(f"Length of split docs: {len(splits)}")
172201
_ = store.add_documents(documents=splits)
202+
203+
204+
if __name__ == "__main__":
205+
converter = NML(embedding_model="ollama:bge-m3:latest")
206+
converter.setup()
207+
converter.create()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"DEFAULT_URL": "https://doi.org/10.7554/eLife.95135.3"}

rag_pkg/gen_rag/data/vector-stores/Readme.md

Lines changed: 0 additions & 3 deletions
This file was deleted.
Binary file not shown.

rag_pkg/gen_rag/data/vector-stores/nml-docs_bge-m3.db/chroma.sqlite3 renamed to rag_pkg/gen_rag/data/vector-stores/nml-docs_bge-m3_latest.db/chroma.sqlite3

25.2 MB
Binary file not shown.

0 commit comments

Comments
 (0)