feat(Vector Stores): Move the indexer and the searcher to use pluggable stores.

stronk7 · stronk7 · commit a49c037732e2 · 2025-11-02T18:43:37.000+01:00
diff --git a/wiki_rag/index/__init__.py b/wiki_rag/index/__init__.py
@@ -2,5 +2,3 @@
 #  SPDX-License-Identifier: BSD-3-Clause
 
 """wiki_rag.index package."""
-
-milvus_url: str = ""  # Default Milvus URL, to be shared across the package.
diff --git a/wiki_rag/index/main.py b/wiki_rag/index/main.py
@@ -11,7 +11,7 @@
 
 from dotenv import load_dotenv
 
-import wiki_rag.index as index
+import wiki_rag.vector as vector
 
 from wiki_rag import LOG_LEVEL, ROOT_DIR, __version__
 from wiki_rag.index.util import (
@@ -21,6 +21,7 @@
     replace_previous_collection,
 )
 from wiki_rag.util import setup_logging
+from wiki_rag.vector import load_vector_store
 
 
 def main():
@@ -70,10 +71,10 @@ def main():
         logger.error("Collection name not found in environment. Exiting.")
         sys.exit(1)
 
-    index.milvus_url = os.getenv("MILVUS_URL")
-    if not index.milvus_url:
-        logger.error("Milvus URL not found in environment. Exiting.")
-        sys.exit(1)
+    index_vendor = os.getenv("INDEX_VENDOR")
+    if not index_vendor:
+        logger.warning("Index vendor (INDEX_VENDOR) not found in environment. Defaulting to 'milvus'.")
+        index_vendor = "milvus"
 
     user_agent = os.getenv("USER_AGENT")
     if not user_agent:
@@ -92,6 +93,8 @@ def main():
         sys.exit(1)
     embedding_dimensions = int(embedding_dimensions)
 
+    vector.store = load_vector_store(index_vendor)  # Set up the global wiki_rag.vector.store to be used elsewhere.
+
     input_candidate = ""
     # TODO: Implement CLI argument to accept the input file here.
 
diff --git a/wiki_rag/index/util.py b/wiki_rag/index/util.py
@@ -1,7 +1,7 @@
 #  Copyright (c) 2025, Moodle HQ - Research
 #  SPDX-License-Identifier: BSD-3-Clause
 
-"""Util functions to proceed to index the information to Milvus collection."""
+"""Util functions to index the information into some collection in a vector store / index."""
 
 import json
 import logging
@@ -11,17 +11,9 @@
 
 from jsonschema import ValidationError, validate
 from langchain_openai import OpenAIEmbeddings
-from pymilvus import (
-    CollectionSchema,
-    DataType,
-    FieldSchema,
-    Function,
-    FunctionType,
-    MilvusClient,
-)
 from tqdm import tqdm
 
-import wiki_rag.index as index
+import wiki_rag.vector as vector
 
 from wiki_rag import ROOT_DIR
 
@@ -72,52 +64,7 @@ def load_parsed_information(input_file: Path) -> dict:
 
 def create_temp_collection_schema(collection_name: str, embedding_dimension: int) -> None:
     """Create a temporary schema for the collection."""
-    milvus = MilvusClient(index.milvus_url)
-    if milvus.has_collection(collection_name):
-        milvus.drop_collection(collection_name)
-
-    fields = [
-        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
-        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=1000),
-        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=5000, enable_analyzer=True,
-                    analyzer_params={"type": "english"}, enable_match=True, ),
-        FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=1000),
-        FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=embedding_dimension),
-        FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
-        FieldSchema(name="parent", dtype=DataType.VARCHAR, max_length=100, nullable=True),
-        FieldSchema(name="children", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=4000,
-                    max_capacity=100, is_array=True),
-        FieldSchema(name="previous", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=4000,
-                    max_capacity=100, is_array=True),
-        FieldSchema(name="next", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=4000,
-                    max_capacity=100, is_array=True),
-        FieldSchema(name="relations", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=4000,
-                    max_capacity=100, is_array=True),
-        FieldSchema(name="page_id", dtype=DataType.INT32),
-        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=100),
-        FieldSchema(name="doc_title", dtype=DataType.VARCHAR, max_length=1000),
-        FieldSchema(name="doc_hash", dtype=DataType.VARCHAR, max_length=100),
-    ]
-    schema = CollectionSchema(fields)
-
-    bm25_function = Function(
-        name="text_bm25_emb",
-        input_field_names=["text"],  # Input text field
-        output_field_names=["sparse_vector"],  # Internal mapping sparse vector field
-        function_type=FunctionType.BM25,  # Model for processing mapping relationship
-    )
-
-    schema.add_function(bm25_function)
-
-    index_params = milvus.prepare_index_params()
-    index_params.add_index(field_name="dense_vector", index_type="HNSW", metric_type="IP",
-                           params={"M": 64, "efConstruction": 100})
-    index_params.add_index(field_name="sparse_vector", index_type="SPARSE_INVERTED_INDEX", metric_type="BM25",
-                           params={"inverted_index_algo": "DAAT_WAND", "drop_ratio_build": 0.2})
-
-    milvus.create_collection(collection_name, schema=schema, index_params=index_params)
-
-    milvus.close()
+    vector.store.create_collection(collection_name, embedding_dimension)
 
 
 def index_pages(
@@ -127,9 +74,7 @@ def index_pages(
         embedding_dimension: int
 ) -> list[int]:
     """Index the pages to the collection."""
-    milvus = MilvusClient(index.milvus_url)
-
-    logging.getLogger("httpx").setLevel(logging.WARNING)
+    logging.getLogger("httpx").setLevel(logging.WARNING)  # Don't log (INFO) all http requests.
 
     embeddings = OpenAIEmbeddings(model=embedding_model, dimensions=embedding_dimension)
 
@@ -142,62 +87,55 @@ def index_pages(
             text_preamble = section["doc_title"]
             if section["title"] != section["doc_title"]:
                 text_preamble = text_preamble + f" / {section['title']}"
-            text_preamble = text_preamble.strip() + "\n\n"
+            text_preamble = text_preamble.strip()
 
             # Calculate the complete text (preamble + text, if existing).
             text_content = section["text"] if section["text"] else ""
             if len(text_content) > 5000:
                 # TODO: We need to split the text in smaller chunks here, say 2500 max or so. For now, just trim.
                 text_content = text_content[:5000].strip()
                 logger.warning(f'Text too long for section "{text_preamble}", trimmed to 5000 characters.')
-            complete_text = text_preamble + text_content
+            complete_text = text_preamble + "\n\n" + text_content
             logger.debug(f"Embedding {text_preamble}, text len {len(text_content)}")
 
             dense_embedding = embeddings.embed_documents([complete_text])
             logger.debug(f"Embedding for {text_preamble}, dim len {len(dense_embedding[0])}")
-            data = [
-                {
-                    "id": str(section["id"]),
-                    "title": section["title"],
-                    "text": text_content,
-                    "source": section["source"],
-                    "dense_vector": dense_embedding[0],
-                    "parent": str(section["parent"]) if section["parent"] else None,
-                    "children": [str(child) for child in section["children"]],
-                    "previous": [str(prv) for prv in section["previous"]],
-                    "next": [str(nxt) for nxt in section["next"]],
-                    "relations": [str(rel) for rel in section["relations"]],
-                    "page_id": int(section["page_id"]),
-                    "doc_id": str(section["doc_id"]),
-                    "doc_title": section["doc_title"],
-                    "doc_hash": str(section["doc_hash"]),
-                }
-            ]
+            record = {
+                "id": str(section["id"]),
+                "title": section["title"],
+                "text": text_content,
+                "source": section["source"],
+                "dense_vector": dense_embedding[0],
+                "parent": str(section["parent"]) if section["parent"] else None,
+                "children": [str(child) for child in section["children"]],
+                "previous": [str(prv) for prv in section["previous"]],
+                "next": [str(nxt) for nxt in section["next"]],
+                "relations": [str(rel) for rel in section["relations"]],
+                "page_id": int(section["page_id"]),
+                "doc_id": str(section["doc_id"]),
+                "doc_title": section["doc_title"],
+                "doc_hash": str(section["doc_hash"]),
+            }
             try:
-                milvus.insert(collection_name, data)
+                vector.store.insert_batch(collection_name, [record])
                 num_sections += 1
             except Exception as e:
                 logger.error(f"Failed to insert data: {e}")
         num_pages += 1
 
-    milvus.close()
     return [num_pages, num_sections]
 
 
 def replace_previous_collection(collection_name: str, temp_collection_name: str) -> None:
     """Replace the previous collection with the new one."""
-    milvus = MilvusClient(index.milvus_url)
-
-    if not milvus.has_collection(temp_collection_name):
+    if not vector.store.collection_exists(temp_collection_name):
         msg = f"Collection {temp_collection_name} does not exist."
         raise ValueError(msg)
 
-    if milvus.has_collection(collection_name):
-        milvus.drop_collection(collection_name)
-    milvus.rename_collection(temp_collection_name, collection_name)
+    if vector.store.collection_exists(collection_name):
+        vector.store.drop_collection(collection_name)
+    vector.store.rename_collection(temp_collection_name, collection_name)
 
     # We have inserted lots of date to the collection, let's compact it.
     logger.info(f"Compacting collection {collection_name}")
-    milvus.compact(collection_name)
-
-    milvus.close()
+    vector.store.compact_collection(collection_name)
diff --git a/wiki_rag/search/main.py b/wiki_rag/search/main.py
@@ -17,11 +17,12 @@
 from langchain_core.messages import AIMessageChunk
 from langfuse.langchain import CallbackHandler
 
-import wiki_rag.index as index
+import wiki_rag.vector as vector
 
 from wiki_rag import LOG_LEVEL, ROOT_DIR, __version__
 from wiki_rag.search.util import ContextSchema, build_graph
 from wiki_rag.util import setup_logging
+from wiki_rag.vector import load_vector_store
 
 
 async def run():
@@ -71,10 +72,10 @@ async def run():
         logger.error("Collection name not found in environment. Exiting.")
         sys.exit(1)
 
-    index.milvus_url = os.getenv("MILVUS_URL")
-    if not index.milvus_url:
-        logger.error("Milvus URL not found in environment. Exiting.")
-        sys.exit(1)
+    index_vendor = os.getenv("INDEX_VENDOR")
+    if not index_vendor:
+        logger.warning("Index vendor (INDEX_VENDOR) not found in environment. Defaulting to 'milvus'.")
+        index_vendor = "milvus"
 
     # If LangSmith tracing is enabled, put a name for the project and verify that all required env vars are set.
     if os.getenv("LANGSMITH_TRACING", "false") == "true":
@@ -155,6 +156,8 @@ async def run():
 
     contextualisation_model = os.getenv("CONTEXTUALISATION_MODEL")
 
+    vector.store = load_vector_store(index_vendor)  # Set up the global wiki_rag.vector.store to be used elsewhere.
+
     # Let's accept arg[1] as the question to be asked.
     parser = argparse.ArgumentParser()
     parser.add_argument("question", nargs="+", help="The question to be asked.")
diff --git a/wiki_rag/search/util.py b/wiki_rag/search/util.py
@@ -17,7 +17,7 @@
     MessagesPlaceholder,
     SystemMessagePromptTemplate,
 )
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langchain_openai import ChatOpenAI
 from langfuse import Langfuse
 from langfuse.langchain import CallbackHandler
 from langfuse.model import TextPromptClient
@@ -26,9 +26,8 @@
 from langgraph.graph.state import CompiledStateGraph
 from langgraph.runtime import Runtime
 from langsmith.client import Client
-from pymilvus import AnnSearchRequest, MilvusClient, WeightedRanker
 
-import wiki_rag.index as index
+import wiki_rag.vector as vector
 
 from wiki_rag import LOG_LEVEL
 
@@ -346,79 +345,18 @@ async def contextualise_question(
 
 
 async def retrieve(state: RagState, runtime: Runtime[ContextSchema]) -> dict:
-    """Retrieve the best matches from the indexed database.
-
-    Here we'll be using Milvus hybrid search that performs a vector search (dense, embeddings)
-    and a BM25 search (sparse, full text). And then will rerank results with the weighted
-    reranker.
-    """
-    # Note that here we are using the Milvus own library instead of the LangChain one because
-    # the LangChain one doesn't support many of the features used here.
-    embeddings = OpenAIEmbeddings(
-        model=runtime.context["embedding_model"],
-        dimensions=runtime.context["embedding_dimension"]
-    )
-    query_embedding = embeddings.embed_query(state["question"])
-
-    milvus = MilvusClient(index.milvus_url)
-
-    # TODO: Make a bunch of the defaults used here configurable.
-    dense_search_limit = 20
-    sparse_search_limit = 20
-    sparse_search_drop_ratio = 0.2
-    hybrid_rerank_limit = 30
-    rerank_weights = (0.7, 0.3)
-
-    # Define the dense search and its parameters.
-    dense_search_params = {
-        "metric_type": "IP",
-        "params": {
-            "ef": dense_search_limit,
-        }
-    }
-    dense_search = AnnSearchRequest(
-        [query_embedding], "dense_vector", dense_search_params, limit=dense_search_limit,
-    )
-
-    # Define the sparse search and its parameters.
-    sparse_search_params = {
-        "metric_type": "BM25",
-        "drop_ratio_search": sparse_search_drop_ratio,
-    }
-    sparse_search = AnnSearchRequest(
-        [state["question"]], "sparse_vector", sparse_search_params, limit=sparse_search_limit,
-    )
-
-    # Perform the hybrid search.
-    retrieved_docs = milvus.hybrid_search(
-        runtime.context["collection_name"],
-        [dense_search, sparse_search],
-        WeightedRanker(*rerank_weights),
-        limit=hybrid_rerank_limit,
-        output_fields=[
-            "id",
-            "title",
-            "text",
-            "source",
-            "doc_id",
-            "doc_title",
-            "doc_hash",
-            "parent",
-            "children",
-            "previous",
-            "next",
-            "relations",
-            "page_id",
-        ]
+    """Retrieve the best matches from the indexed database."""
+    results = vector.store.retrieve(
+        collection_name=runtime.context["collection_name"],
+        embedding_model=runtime.context["embedding_model"],
+        embedding_dimensions=runtime.context["embedding_dimension"],
+        query=state["question"],
     )
-    milvus.close()
 
     # TODO: Return only the docs which distance is below the cutoff.
-    # distance_cutoff = config["configurable"]["search_distance_cutoff"]
+    # distance_cutoff = runtime.context["search_distance_cutoff"]
     # return {"vector_search": [doc for doc in retrieved_docs[0] if doc["distance"] >= distance_cutoff]}
-    results = [dict(doc) for doc in retrieved_docs[0]]  # Need this: Langfuse has problems with Milvus Hit objects.
-    return {"vector_search": results}                   # those are UserDict objects, hence, not json-serializable.
-    # Reported @ https://github.com/langfuse/langfuse/issues/9294 , we'll need to keep the workaround, it seems.
+    return {"vector_search": results}
 
 
 async def optimise(state: RagState, runtime: Runtime[ContextSchema]) -> dict:
@@ -555,7 +493,7 @@ def retrieve_all_elements(retrieved_docs, context_list, collection_name: str) ->
             context_texts[id] = f"{retrieved[0]["entity"]["title"]}\n\n{retrieved[0]["entity"]["text"]}"
         else:
             context_texts[id] = None
-            # If not, let's retrieve it from the milvus collection.
+            # If not, let's accumulate it for later id based retrieval.
             context_missing.append(id)
 
     missing_docs = get_missing_from_vector_store(context_missing, collection_name)
@@ -573,16 +511,7 @@ def get_missing_from_vector_store(context_missing: list, collection_name: str) -
     if not context_missing:  # No missing elements, nothing extra to retrieve.
         return {}
 
-    milvus = MilvusClient(index.milvus_url)
-
-    # Let's find in the collection, the missing elements and get their titles and texts.
-    missing_docs_db = milvus.query(
-        collection_name,
-        ids=context_missing,
-        output_fields=["id", "title", "text"])
-    missing_docs = {doc["id"]: f"{doc["title"]}\n\n{doc["text"]}" for doc in missing_docs_db}
-    milvus.close()
-    return missing_docs
+    return vector.store.get_documents_contents_by_id(collection_name, context_missing)
 
 
 async def generate(state: RagState, runtime: Runtime[ContextSchema]) -> dict: