Author: Ankur Sinha <sanjay DOT ankur AT gmail DOT com>
"""

11+ import json
1112import logging
1213import mimetypes
1314from glob import glob
2021 MarkdownHeaderTextSplitter ,
2122 RecursiveCharacterTextSplitter ,
2223)
23- from neuroml_ai .rag .utils import setup_embedding
24+ from neuroml_ai_utils .llm import setup_embedding
25+
26+ logging .basicConfig (level = logging .WARNING )
2427
2528
class NML(object):
    """NeuroML vector store generator."""

    # Limit markdown splitting to the top two header levels; deeper
    # headers are deliberately disabled (kept here, commented out, to
    # document the choice).
    md_headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        # ("###", "Header 3"),
        # ("####", "Header 4"),
    ]
3539
36- def __init__ (self , embedding_model : str , logging_level : int = logging .INFO ):
40+ def __init__ (self , embedding_model : str , logging_level : int = logging .DEBUG ):
3741 """TODO: to be defined."""
3842 self .chunk_size = 600
3943 self .chunk_overlap = 60
@@ -42,9 +46,9 @@ def __init__(self, embedding_model: str, logging_level: int = logging.INFO):
4246 my_path = Path (__file__ ).parent
4347 self .stores_sources_path = f"{ my_path } /sources"
4448
45- self .logger = logging .getLogger ("NeuroML-AI " )
49+ self .logger = logging .getLogger ("NeuroML-doc-embeddings " )
4650 self .logger .setLevel (logging_level )
47- self .logger .propagate = False
51+ self .logger .propagate = True
4852
4953 def setup (self ):
5054 """Setup embeddings"""
@@ -96,8 +100,19 @@ def create(self):
96100 client_settings = chroma_client_settings_text ,
97101 )
98102
99- info_files = glob (f"{ src } /*" , recursive = True )
103+ info_files = glob (f"{ src } /*.md" , recursive = True )
104+ url_maps = glob (f"{ src } /*.json" , recursive = True )
100105 self .logger .debug (f"Loaded { len (info_files )} files from { src } " )
106+ self .logger .debug (f"Loaded { len (url_maps )} url map files from { src } " )
107+
108+ # only a single url map file is allowed here
109+ assert len (url_maps ) <= 1
110+
111+ if len (url_maps ) == 1 :
112+ with open (url_maps [0 ], "r" ) as f :
113+ url_map_data = json .load (f )
114+ else :
115+ url_map_data = {}
101116
102117 for info_file in info_files :
103118 try :
@@ -108,7 +123,7 @@ def create(self):
108123
109124 if file_type :
110125 if "markdown" in file_type :
111- self .add_md (store , info_file )
126+ self .add_md (store , info_file , url_map_data )
112127 else :
113128 self .logger .warning (
114129 f"File { info_file } is of type { file_type } which is not currently supported. Skipping"
@@ -118,7 +133,7 @@ def create(self):
118133 f"Could not guess file type for file { info_file } . Skipping"
119134 )
120135
121- def add_md (self , store , file ):
136+ def add_md (self , store , file , url_map ):
122137 """Add a markdown file to the vector store
123138
124139 We add the file hash as extra metadata so that we can filter on it
@@ -160,13 +175,33 @@ def add_md(self, store, file):
160175 )
161176 splits = text_splitter .split_documents (md_splits )
162177 for split in splits :
163- split .metadata .update (
164- {
165- "file_hash" : file_hash ,
166- "file_name" : file_path .name ,
167- "file_path" : str (file_path ),
168- }
169- )
178+ # get url
179+ # header 1
180+ url = None
181+ if "Header 1" in split .metadata .keys ():
182+ url = url_map .get (split .metadata ["Header 1" ], None )
183+ # try header 2: more specific
184+ if "Header 2" in split .metadata .keys ():
185+ url = url_map .get (split .metadata ["Header 2" ], None )
186+ # fall back to default url
187+ if not url :
188+ url = url_map .get ("DEFAULT_URL" )
189+
190+ meta_update = {
191+ "file_hash" : file_hash ,
192+ "file_name" : file_path .name ,
193+ "file_path" : str (file_path ),
194+ "url" : url ,
195+ }
196+ self .logger .debug (f"{ meta_update = } " )
197+
198+ split .metadata .update (meta_update )
170199
171200 self .logger .debug (f"Length of split docs: { len (splits )} " )
172201 _ = store .add_documents (documents = splits )
202+
203+
if __name__ == "__main__":
    # Script entry point: build the documentation vector stores using a
    # local Ollama embedding model.
    generator = NML(embedding_model="ollama:bge-m3:latest")
    generator.setup()
    generator.create()