11# Copyright (c) 2025, Moodle HQ - Research
22# SPDX-License-Identifier: BSD-3-Clause
33
4- """Util functions to proceed to index the information to Milvus collection."""
4+ """Util functions to index the information into a collection of a vector store / index."""
55
66import json
77import logging
1111
1212from jsonschema import ValidationError , validate
1313from langchain_openai import OpenAIEmbeddings
14- from pymilvus import (
15- CollectionSchema ,
16- DataType ,
17- FieldSchema ,
18- Function ,
19- FunctionType ,
20- MilvusClient ,
21- )
2214from tqdm import tqdm
2315
24- import wiki_rag .index as index
16+ import wiki_rag .vector as vector
2517
2618from wiki_rag import ROOT_DIR
2719
@@ -72,52 +64,7 @@ def load_parsed_information(input_file: Path) -> dict:
7264
7365def create_temp_collection_schema (collection_name : str , embedding_dimension : int ) -> None :
7466 """Create a temporary schema for the collection."""
75- milvus = MilvusClient (index .milvus_url )
76- if milvus .has_collection (collection_name ):
77- milvus .drop_collection (collection_name )
78-
79- fields = [
80- FieldSchema (name = "id" , dtype = DataType .VARCHAR , is_primary = True , auto_id = False , max_length = 100 ),
81- FieldSchema (name = "title" , dtype = DataType .VARCHAR , max_length = 1000 ),
82- FieldSchema (name = "text" , dtype = DataType .VARCHAR , max_length = 5000 , enable_analyzer = True ,
83- analyzer_params = {"type" : "english" }, enable_match = True , ),
84- FieldSchema (name = "source" , dtype = DataType .VARCHAR , max_length = 1000 ),
85- FieldSchema (name = "dense_vector" , dtype = DataType .FLOAT_VECTOR , dim = embedding_dimension ),
86- FieldSchema (name = "sparse_vector" , dtype = DataType .SPARSE_FLOAT_VECTOR ),
87- FieldSchema (name = "parent" , dtype = DataType .VARCHAR , max_length = 100 , nullable = True ),
88- FieldSchema (name = "children" , dtype = DataType .ARRAY , element_type = DataType .VARCHAR , max_length = 4000 ,
89- max_capacity = 100 , is_array = True ),
90- FieldSchema (name = "previous" , dtype = DataType .ARRAY , element_type = DataType .VARCHAR , max_length = 4000 ,
91- max_capacity = 100 , is_array = True ),
92- FieldSchema (name = "next" , dtype = DataType .ARRAY , element_type = DataType .VARCHAR , max_length = 4000 ,
93- max_capacity = 100 , is_array = True ),
94- FieldSchema (name = "relations" , dtype = DataType .ARRAY , element_type = DataType .VARCHAR , max_length = 4000 ,
95- max_capacity = 100 , is_array = True ),
96- FieldSchema (name = "page_id" , dtype = DataType .INT32 ),
97- FieldSchema (name = "doc_id" , dtype = DataType .VARCHAR , max_length = 100 ),
98- FieldSchema (name = "doc_title" , dtype = DataType .VARCHAR , max_length = 1000 ),
99- FieldSchema (name = "doc_hash" , dtype = DataType .VARCHAR , max_length = 100 ),
100- ]
101- schema = CollectionSchema (fields )
102-
103- bm25_function = Function (
104- name = "text_bm25_emb" ,
105- input_field_names = ["text" ], # Input text field
106- output_field_names = ["sparse_vector" ], # Internal mapping sparse vector field
107- function_type = FunctionType .BM25 , # Model for processing mapping relationship
108- )
109-
110- schema .add_function (bm25_function )
111-
112- index_params = milvus .prepare_index_params ()
113- index_params .add_index (field_name = "dense_vector" , index_type = "HNSW" , metric_type = "IP" ,
114- params = {"M" : 64 , "efConstruction" : 100 })
115- index_params .add_index (field_name = "sparse_vector" , index_type = "SPARSE_INVERTED_INDEX" , metric_type = "BM25" ,
116- params = {"inverted_index_algo" : "DAAT_WAND" , "drop_ratio_build" : 0.2 })
117-
118- milvus .create_collection (collection_name , schema = schema , index_params = index_params )
119-
120- milvus .close ()
67+ vector .store .create_collection (collection_name , embedding_dimension )
12168
12269
12370def index_pages (
@@ -127,9 +74,7 @@ def index_pages(
12774 embedding_dimension : int
12875) -> list [int ]:
12976 """Index the pages to the collection."""
130- milvus = MilvusClient (index .milvus_url )
131-
132- logging .getLogger ("httpx" ).setLevel (logging .WARNING )
77+ logging .getLogger ("httpx" ).setLevel (logging .WARNING ) # Don't log (INFO) all http requests.
13378
13479 embeddings = OpenAIEmbeddings (model = embedding_model , dimensions = embedding_dimension )
13580
@@ -142,62 +87,55 @@ def index_pages(
14287 text_preamble = section ["doc_title" ]
14388 if section ["title" ] != section ["doc_title" ]:
14489 text_preamble = text_preamble + f" / { section ['title' ]} "
145- text_preamble = text_preamble .strip () + " \n \n "
90+ text_preamble = text_preamble .strip ()
14691
14792 # Calculate the complete text (preamble + text, if existing).
14893 text_content = section ["text" ] if section ["text" ] else ""
14994 if len (text_content ) > 5000 :
15095 # TODO: We need to split the text into smaller chunks here, say 2500 max or so. For now, just trim.
15196 text_content = text_content [:5000 ].strip ()
15297 logger .warning (f'Text too long for section "{ text_preamble } ", trimmed to 5000 characters.' )
153- complete_text = text_preamble + text_content
98+ complete_text = text_preamble + " \n \n " + text_content
15499 logger .debug (f"Embedding { text_preamble } , text len { len (text_content )} " )
155100
156101 dense_embedding = embeddings .embed_documents ([complete_text ])
157102 logger .debug (f"Embedding for { text_preamble } , dim len { len (dense_embedding [0 ])} " )
158- data = [
159- {
160- "id" : str (section ["id" ]),
161- "title" : section ["title" ],
162- "text" : text_content ,
163- "source" : section ["source" ],
164- "dense_vector" : dense_embedding [0 ],
165- "parent" : str (section ["parent" ]) if section ["parent" ] else None ,
166- "children" : [str (child ) for child in section ["children" ]],
167- "previous" : [str (prv ) for prv in section ["previous" ]],
168- "next" : [str (nxt ) for nxt in section ["next" ]],
169- "relations" : [str (rel ) for rel in section ["relations" ]],
170- "page_id" : int (section ["page_id" ]),
171- "doc_id" : str (section ["doc_id" ]),
172- "doc_title" : section ["doc_title" ],
173- "doc_hash" : str (section ["doc_hash" ]),
174- }
175- ]
103+ record = {
104+ "id" : str (section ["id" ]),
105+ "title" : section ["title" ],
106+ "text" : text_content ,
107+ "source" : section ["source" ],
108+ "dense_vector" : dense_embedding [0 ],
109+ "parent" : str (section ["parent" ]) if section ["parent" ] else None ,
110+ "children" : [str (child ) for child in section ["children" ]],
111+ "previous" : [str (prv ) for prv in section ["previous" ]],
112+ "next" : [str (nxt ) for nxt in section ["next" ]],
113+ "relations" : [str (rel ) for rel in section ["relations" ]],
114+ "page_id" : int (section ["page_id" ]),
115+ "doc_id" : str (section ["doc_id" ]),
116+ "doc_title" : section ["doc_title" ],
117+ "doc_hash" : str (section ["doc_hash" ]),
118+ }
176119 try :
177- milvus . insert (collection_name , data )
120+ vector . store . insert_batch (collection_name , [ record ] )
178121 num_sections += 1
179122 except Exception as e :
180123 logger .error (f"Failed to insert data: { e } " )
181124 num_pages += 1
182125
183- milvus .close ()
184126 return [num_pages , num_sections ]
185127
186128
187129def replace_previous_collection (collection_name : str , temp_collection_name : str ) -> None :
188130 """Replace the previous collection with the new one."""
189- milvus = MilvusClient (index .milvus_url )
190-
191- if not milvus .has_collection (temp_collection_name ):
131+ if not vector .store .collection_exists (temp_collection_name ):
192132 msg = f"Collection { temp_collection_name } does not exist."
193133 raise ValueError (msg )
194134
195- if milvus . has_collection (collection_name ):
196- milvus .drop_collection (collection_name )
197- milvus .rename_collection (temp_collection_name , collection_name )
135+ if vector . store . collection_exists (collection_name ):
136+ vector . store .drop_collection (collection_name )
137+ vector . store .rename_collection (temp_collection_name , collection_name )
198138
199139 # We have inserted lots of data into the collection, let's compact it.
200140 logger .info (f"Compacting collection { collection_name } " )
201- milvus .compact (collection_name )
202-
203- milvus .close ()
141+ vector .store .compact_collection (collection_name )
0 commit comments