fix: adjust RAG tool behavior and docs (#198)

yueliao11 · yueliao11 · commit d5978ab0e1b2 · 2025-12-16T23:20:28.000+08:00
diff --git a/spoon_ai/rag/config.py b/spoon_ai/rag/config.py
@@ -30,50 +30,7 @@ class RagConfig:
     rag_dir: str = ".rag_store"
 
 
-_PLACEHOLDER_PATTERNS = [
-    r"^sk-your-.*-key-here$",
-    r"^sk-your-openai-api-key-here$",
-    r"^your-.*-api-key-here$",
-    r"^your_api_key$",
-    r"^api_key_here$",
-    r"^<.*>$",
-    r"^\[.*\]$",
-    r"^\{.*\}$",
-]
-
-# Mapping of known OpenAI-compatible providers to their defaults
-# This allows using project-standard keys (e.g. DEEPSEEK_API_KEY) with RAG automatically.
-_COMPATIBLE_PROVIDERS: Dict[str, Dict[str, str]] = {
-    "deepseek": {
-        "env_key": "DEEPSEEK_API_KEY",
-        "base_url": "https://api.deepseek.com/v1",
-        "default_model": "",  # Let server decide or user override
-    },
-    "openrouter": {
-        "env_key": "OPENROUTER_API_KEY",
-        "base_url": "https://openrouter.ai/api/v1",
-        "default_model": "",
-    },
-    # Note: Gemini and Anthropic are not strictly OpenAI-compatible for embeddings (paths differ),
-    # so we do not auto-map them to AnyRoute to avoid runtime errors unless explicitly configured.
-}
-
-
-def _is_placeholder(value: Optional[str]) -> bool:
-    if not value or not isinstance(value, str):
-        return True
-    v = value.strip().lower()
-    if not v:
-        return True
-    for p in _PLACEHOLDER_PATTERNS:
-        if re.match(p, v):
-            return True
-    # Common keywords that indicate examples
-    for k in ("placeholder", "example", "sample", "demo", "insert", "replace", "change-me"):
-        if k in v:
-            return True
-    return False
-
+from spoon_ai.llm.config import ConfigurationManager
 
 def get_default_config() -> RagConfig:
     backend = os.getenv("RAG_BACKEND", "faiss").lower()
@@ -83,50 +40,66 @@ def get_default_config() -> RagConfig:
     chunk_size = int(os.getenv("CHUNK_SIZE", "800"))
     chunk_overlap = int(os.getenv("CHUNK_OVERLAP", "120"))
 
-    # Embeddings provider selection
-    embeddings_provider = None
-    
-    # 1. AnyRoute (Explicit RAG config) - Highest Priority
-    anyroute_api_key = os.getenv("ANYROUTE_API_KEY")
-    anyroute_base = os.getenv("ANYROUTE_BASE_URL")
-    anyroute_model = os.getenv("ANYROUTE_MODEL")
+    # Use LLM ConfigurationManager for standardized provider detection
+    config_manager = ConfigurationManager()
     
-    # 2. OpenAI (Native support)
-    openai_key = os.getenv("OPENAI_API_KEY")
-
-    # Logic to determine provider
-    if (anyroute_api_key and anyroute_base) and not (_is_placeholder(anyroute_api_key) or _is_placeholder(anyroute_base)):
+    # 1. Determine active provider
+    # Try ANYROUTE_API_KEY explicitly first (legacy RAG priority)
+    anyroute_key = os.getenv("ANYROUTE_API_KEY")
+    # Use static method from ConfigurationManager
+    if anyroute_key and not ConfigurationManager._is_placeholder_value(anyroute_key):
         embeddings_provider = "anyroute"
-    elif openai_key and not _is_placeholder(openai_key):
-        embeddings_provider = "openai"
+        anyroute_base = os.getenv("ANYROUTE_BASE_URL", "https://api.openai.com/v1") # Default generic
+        anyroute_model = os.getenv("ANYROUTE_MODEL")
+        openai_key = None
     else:
-        # 3. Try Auto-mapping compatible providers (DeepSeek, OpenRouter, etc.)
-        for name, defaults in _COMPATIBLE_PROVIDERS.items():
-            key_val = os.getenv(defaults["env_key"])
-            if key_val and not _is_placeholder(key_val):
-                embeddings_provider = "anyroute"
-                anyroute_api_key = key_val
-                # Use provider default base URL if explicit ANYROUTE_BASE_URL is missing
-                anyroute_base = anyroute_base or defaults["base_url"]
-                # Use provider default model if explicit ANYROUTE_MODEL is missing
-                if not anyroute_model and defaults["default_model"]:
-                    anyroute_model = defaults["default_model"]
-                break
+        # Fallback to LLM module's intelligent selection
+        # This picks defaults based on available API keys (OpenAI > Anthropic > OpenRouter...)
+        # Note: Anthropic/Gemini are not directly supported for embeddings here unless mapped
+        provider = config_manager.get_default_provider()
         
-        # 4. Fallback
-        if not embeddings_provider:
-             embeddings_provider = "hash"  # deterministic offline fallback
+        # Load full config for the selected provider
+        try:
+            llm_config = config_manager.load_provider_config(provider)
+        except Exception:
+            llm_config = None
+
+        embeddings_provider = "hash" # Default fallback
+        anyroute_key = None
+        anyroute_base = None
+        anyroute_model = None
+        openai_key = None
 
+        if llm_config:
+            if provider == "openai":
+                embeddings_provider = "openai"
+                openai_key = llm_config.api_key
+            elif provider in ("deepseek", "openrouter", "anyroute"):
+                # Map compatible OpenAI-like providers to AnyRoute client
+                embeddings_provider = "anyroute"
+                anyroute_key = llm_config.api_key
+                anyroute_base = llm_config.base_url
+                # Model priority: explicit ANYROUTE_MODEL, then a provider-aware default (tolerating an unset LLM model).
+                llm_model = llm_config.model or ""
+                env_model = os.getenv("ANYROUTE_MODEL")
+                if env_model:
+                    anyroute_model = env_model
+                elif provider == "openrouter" and "embedding" not in llm_model.lower():
+                    # OpenRouter: Default to openai/text-embedding-3-small if main model is not an embedding model
+                    anyroute_model = "openai/text-embedding-3-small"
+                else:
+                    anyroute_model = llm_config.model
+    
     return RagConfig(
         backend=backend,
         collection=collection,
         top_k=top_k,
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
         embeddings_provider=embeddings_provider,
-        anyroute_api_key=None if _is_placeholder(anyroute_api_key) else anyroute_api_key,
-        anyroute_base_url=None if _is_placeholder(anyroute_base) else anyroute_base,
+        anyroute_api_key=anyroute_key,
+        anyroute_base_url=anyroute_base,
         anyroute_model=anyroute_model,
-        openai_api_key=None if _is_placeholder(openai_key) else openai_key,
+        openai_api_key=openai_key,
         rag_dir=rag_dir,
     )
diff --git a/spoon_ai/rag/vectorstores/chroma_store.py b/spoon_ai/rag/vectorstores/chroma_store.py
@@ -31,12 +31,40 @@ def _get_collection(self, name: str):
 
     def add(self, *, collection: str, ids: List[str], embeddings: List[List[float]], metadatas: List[Dict]) -> None:
         col = self._get_collection(collection)
-        col.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
+        try:
+            col.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
+        except Exception as e:
+            # Make dimension errors actionable ("dimension" also matches "dimensionality").
+            if "dimension" in str(e).lower():
+                raise ValueError(
+                    f"Chroma embedding dimension mismatch in collection '{collection}'. "
+                    "You may be using a different embedding model than the one used to create this collection. "
+                    f"Consider deleting the collection via `store.delete_collection('{collection}')` "
+                    "or using a new collection name."
+                ) from e
+            raise
 
     def query(self, *, collection: str, query_embeddings: List[List[float]], top_k: int = 5, filter: Optional[Dict] = None) -> List[List[Tuple[str, float, Dict]]]:
         col = self._get_collection(collection)
-        # Chroma >=1.3 disallows requesting "ids" in include; request metadatas+distances only.
-        res = col.query(query_embeddings=query_embeddings, n_results=top_k, include=["metadatas", "distances"])
+        # Chroma rejects an empty `where` and needs `$and` to combine several keys.
+        where = filter or None
+        if where and len(where) > 1:
+            where = {"$and": [{k: v} for k, v in where.items()]}
+        try:
+            # Chroma >=1.3 disallows requesting "ids" in include; request metadatas+distances only.
+            res = col.query(
+                query_embeddings=query_embeddings, n_results=top_k,
+                include=["metadatas", "distances"], where=where,
+            )
+        except Exception as e:
+            if "dimension" in str(e).lower():  # also matches "dimensionality"
+                raise ValueError(
+                    f"Chroma query dimension mismatch in collection '{collection}'. "
+                    "The query embedding dimension does not match the collection's index. "
+                    "Please ensure you are using the same embedding model as when the data was ingested."
+                ) from e
+            raise
+
         out: List[List[Tuple[str, float, Dict]]] = []
         q = len(query_embeddings)
         for i in range(q):
diff --git a/spoon_ai/rag/vectorstores/faiss_store.py b/spoon_ai/rag/vectorstores/faiss_store.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 from typing import Dict, List, Optional, Tuple
 
 from .base import VectorStore
@@ -8,11 +9,73 @@
 class FaissVectorStore(VectorStore):
     """FAISS-backed local vector store (cosine via inner product + L2 norm)."""
 
-    def __init__(self) -> None:
+    def __init__(self, *, persist_dir: Optional[str] = None) -> None:
+        # Directory priority: argument, RAG_FAISS_DIR, then "<RAG_DIR>/faiss"; empty env values count as unset.
+        self.persist_dir = persist_dir or os.getenv("RAG_FAISS_DIR") or os.path.join(os.getenv("RAG_DIR") or ".rag_store", "faiss")
         self._collections: Dict[str, Dict] = {}
+        self._load()
+
+    def _get_index_path(self, collection: str) -> str:
+        return os.path.join(self.persist_dir, f"{collection}.index")
+
+    def _get_meta_path(self, collection: str) -> str:
+        return os.path.join(self.persist_dir, f"{collection}.pkl")
+
+    def _load(self):
+        """Load every persisted collection found in ``persist_dir`` into memory.
+
+        SECURITY: metadata is unpickled; only point ``persist_dir`` at trusted files.
+        """
+        if not os.path.isdir(self.persist_dir):
+            return
+
+        # Imported after the directory check so an absent store never requires faiss here.
+        import pickle
+        import faiss # type: ignore
+
+        suffix = ".index"
+        for fname in os.listdir(self.persist_dir):
+            if not fname.endswith(suffix):
+                continue
+            collection = fname[: -len(suffix)]
+            meta_path = self._get_meta_path(collection)
+            if not os.path.exists(meta_path):
+                continue  # an index without its metadata file is skipped
+            try:
+                index = faiss.read_index(os.path.join(self.persist_dir, fname))
+                with open(meta_path, "rb") as f:
+                    meta_data = pickle.load(f)
+                self._collections[collection] = {
+                    "index": index,
+                    "ids": meta_data["ids"],
+                    "metas": meta_data["metas"],
+                    "dim": meta_data["dim"],
+                }
+            except Exception as e:  # best-effort: skip unreadable/corrupted collections
+                print(f"Error loading FAISS collection '{collection}': {e}")
+
+    def _save(self, collection: str):
+        """Persist one in-memory collection (FAISS index + pickled metadata) to ``persist_dir``."""
+        col = self._collections.get(collection)
+        if not col:
+            return  # nothing to persist, so do not create the directory either
+
+        import pickle
+        import faiss # type: ignore
+
+        os.makedirs(self.persist_dir, exist_ok=True)
+        index_path = self._get_index_path(collection)
+        meta_path = self._get_meta_path(collection)
+
+        # Write to temporary names first and rename afterwards, so an interrupted
+        # write never leaves a truncated file under the final name.
+        faiss.write_index(col["index"], index_path + ".tmp")
+        with open(meta_path + ".tmp", "wb") as f:
+            pickle.dump({"ids": col["ids"], "metas": col["metas"], "dim": col["dim"]}, f)
+        os.replace(index_path + ".tmp", index_path)
+        os.replace(meta_path + ".tmp", meta_path)
 
     def _get_or_create(self, collection: str, dim: Optional[int] = None):
-        import numpy as np  # noqa: F401
         import faiss  # type: ignore
 
         col = self._collections.get(collection)
@@ -53,11 +116,19 @@ def add(self, *, collection: str, ids: List[str], embeddings: List[List[float]],
         col["ids"].extend(ids)
         for id_, md in zip(ids, metadatas):
             col["metas"][id_] = md
+        
+        # Persist changes
+        self._save(collection)
 
     def query(self, *, collection: str, query_embeddings: List[List[float]], top_k: int = 5, filter: Optional[Dict] = None) -> List[List[Tuple[str, float, Dict]]]:
         import numpy as np
 
-        col = self._get_or_create(collection)
+        # Persisted collections are loaded by _load() at init; a query never creates one implicitly.
+        col = self._collections.get(collection)
+        if not col:
+            # If not in memory and not loaded, it doesn't exist
+            return [[] for _ in query_embeddings]
+
         if len(col["ids"]) == 0:
             return [[] for _ in query_embeddings]
 
@@ -85,5 +156,14 @@ def query(self, *, collection: str, query_embeddings: List[List[float]], top_k:
         return results
 
     def delete_collection(self, collection: str) -> None:
+        import os
         self._collections.pop(collection, None)
+        # Also remove from disk: each file is handled separately; only a missing file is silently ignored.
+        for path in (self._get_index_path(collection), self._get_meta_path(collection)):
+            try:
+                os.remove(path)
+            except FileNotFoundError:
+                pass
+            except OSError as e:
+                print(f"Error deleting FAISS file '{path}': {e}")
 
diff --git a/spoon_ai/rag/vectorstores/pinecone_store.py b/spoon_ai/rag/vectorstores/pinecone_store.py
@@ -133,7 +133,7 @@ def _extract_index_names(obj) -> set:
     def add(self, *, collection: str, ids: List[str], embeddings: List[List[float]], metadatas: List[Dict]) -> None:
         index = self._ensure_index(dim=len(embeddings[0]) if embeddings else None)
         vectors = [
-            {"id": id_, "values": vec, "metadata": md}
+            {"id": id_, "values": [float(x) for x in vec], "metadata": md}
             for id_, vec, md in zip(ids, embeddings, metadatas)
         ]
         index.upsert(vectors=vectors, namespace=collection)
@@ -142,7 +142,8 @@ def query(self, *, collection: str, query_embeddings: List[List[float]], top_k:
         index = self._ensure_index()
         results: List[List[Tuple[str, float, Dict]]] = []
         for q in query_embeddings:
-            res = index.query(namespace=collection, vector=q, top_k=top_k, include_metadata=True)
+            # Pass filter dict directly (Pinecone uses Mongo-style filters)
+            res = index.query(namespace=collection, vector=q, top_k=top_k, include_metadata=True, filter=filter)
             matches = res.get("matches", []) if isinstance(res, dict) else getattr(res, "matches", [])
             out: List[Tuple[str, float, Dict]] = []
             for m in matches:
diff --git a/spoon_ai/rag/vectorstores/qdrant_store.py b/spoon_ai/rag/vectorstores/qdrant_store.py
@@ -61,10 +61,25 @@ def add(self, *, collection: str, ids: List[str], embeddings: List[List[float]],
 
     def query(self, *, collection: str, query_embeddings: List[List[float]], top_k: int = 5, filter: Optional[Dict] = None) -> List[List[Tuple[str, float, Dict]]]:
         client = self._client_or_raise()
+        
+        # Build a typed Qdrant filter: every key/value pair in `filter` must match exactly.
+        # NOTE(review): raw dict filters appear to be coerced only on the REST path — confirm; typed models avoid that.
+        q_filter = None
+        if filter:
+            from qdrant_client import models  # lazy import: only needed when filtering
+            musts = [models.FieldCondition(key=k, match=models.MatchValue(value=v)) for k, v in filter.items()]
+            q_filter = models.Filter(must=musts)
+
         results: List[List[Tuple[str, float, Dict]]] = []
         for q in query_embeddings:
             # qdrant-client >=1.x uses query_points for vector search; ensure payload returned
-            res = client.query_points(collection_name=collection, query=q, limit=top_k, with_payload=True)
+            res = client.query_points(
+                collection_name=collection, 
+                query=q, 
+                limit=top_k, 
+                with_payload=True,
+                query_filter=q_filter
+            )
             # Normalize response
             try:
                 points = res.points  # type: ignore[attr-defined]
diff --git a/spoon_ai/rag/vectorstores/registry.py b/spoon_ai/rag/vectorstores/registry.py
diff --git a/spoon_ai/tools/rag_tools.py b/spoon_ai/tools/rag_tools.py