Skip to content

Commit 586e54d

Browse files
committed
chore: fix indexing in prod when using multiple workers
1 parent 36d48ff commit 586e54d

File tree

9 files changed

+36
-20
lines changed

9 files changed

+36
-20
lines changed

.github/deploy.sh

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,20 @@
99
# ssh expasychat
1010
## Just restart:
1111
# sudo -u podman bash -c 'cd /var/containers/podman/sparql-llm ; XDG_RUNTIME_DIR=/run/user/1001 podman-compose -f compose.prod.yml up --force-recreate -d'
12+
1213
## Pull and restart:
1314
# sudo -u podman bash -c 'cd /var/containers/podman/sparql-llm ; git pull ; XDG_RUNTIME_DIR=/run/user/1001 podman-compose -f compose.prod.yml up --force-recreate -d'
14-
## Show logs:
15-
# sudo -u podman bash -c 'cd /var/containers/podman/sparql-llm ; XDG_RUNTIME_DIR=/run/user/1001 podman-compose -f compose.prod.yml logs'
1615

17-
## Delete the vector database to re-index from scratch:
18-
# sudo -u podman bash -c 'cd /var/containers/podman/sparql-llm ; rm -rf data/qdrant/ data/endpoints_metadata.json'
19-
# Careful as you will need to first run it without compose.prod.yml to regenerate the index, the multiple workers in prod will conflict when re-indexing from scratch
16+
## Pull, build and restart:
17+
# sudo -u podman bash -c 'cd /var/containers/podman/sparql-llm ; git pull ; XDG_RUNTIME_DIR=/run/user/1001 podman-compose -f compose.prod.yml up --force-recreate --build -d'
2018

19+
## Re-index the endpoints in a running deployment:
20+
# sudo -u podman bash -c 'cd /var/containers/podman/sparql-llm ; XDG_RUNTIME_DIR=/run/user/1001 podman-compose -f compose.prod.yml exec api uv run src/sparql_llm/agent/indexing/index_resources.py'
2121

22-
## Re-index without restarting
23-
# sudo -u podman bash -c 'cd /var/containers/podman/sparql-llm ; XDG_RUNTIME_DIR=/run/user/1001 podman-compose exec api uv run src/expasy_agent/indexing/index_resources.py'
22+
## Show logs:
23+
# sudo -u podman bash -c 'cd /var/containers/podman/sparql-llm ; XDG_RUNTIME_DIR=/run/user/1001 podman-compose -f compose.prod.yml logs'
2424

25-
# Check env variables
25+
## Check env variables
2626
# sudo -u podman bash -c 'cd /var/containers/podman/sparql-llm ; vim .env'
2727

2828
# NOTE: if OOM error, check `dmesg` on server and search for `oom`

.github/workflows/test.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ jobs:
1717
matrix:
1818
# os: ["ubuntu-latest", "windows-latest", "macos-latest"]
1919
os: ["ubuntu-latest"]
20-
python-version: ["3.11", "3.12", "3.13", "3.14"]
20+
python-version: ["3.11", "3.12", "3.13"]
21+
# onnxruntime not available for "3.14"
2122

2223
steps:
2324
- uses: actions/checkout@v6

README.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -263,8 +263,8 @@ Requirements: Docker, nodejs (to build the frontend), and optionally [`uv`](http
263263
CHAT_API_KEY=NOT_SO_SECRET_API_KEY_USED_BY_FRONTEND_TO_AVOID_SPAM_FROM_CRAWLERS
264264
LOGS_API_KEY=SECRET_PASSWORD_TO_EASILY_ACCESS_LOGS_THROUGH_THE_API
265265

266-
OPENAI_API_KEY=sk-proj-YYY
267266
OPENROUTER_API_KEY=sk-YYY
267+
OPENAI_API_KEY=sk-proj-YYY
268268

269269
LANGFUSE_HOST=https://cloud.langfuse.com
270270
LANGFUSE_PUBLIC_KEY=
@@ -292,13 +292,19 @@ Requirements: Docker, nodejs (to build the frontend), and optionally [`uv`](http
292292
* OpenAPI Swagger UI available at http://localhost:8000/docs
293293
* Vector database dashboard UI available at http://localhost:6333/dashboard
294294

295-
In production, you will need to make some changes to the `compose.prod.yml` file to adapt it to your server/proxy setup:
295+
**In production**, you will need to make some changes to the `compose.prod.yml` file to adapt it to your server/proxy setup:
296296

297297
```bash
298298
docker compose -f compose.prod.yml up
299299
```
300300

301-
> All data from the containers are stored persistently in the `data` folder (e.g. vectordb indexes)
301+
Then run the indexing script manually from within the container to index the SPARQL endpoints (this only needs to be done once):
302+
303+
```sh
304+
docker compose -f compose.prod.yml exec api uv run src/sparql_llm/agent/indexing/index_resources.py
305+
```
306+
307+
> All data from the containers are stored persistently in the `data` folder (e.g. vectordb indexes and endpoints metadata)
302308
303309
> [!NOTE]
304310
>

compose.prod.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ services:
1717
- 1.1.1.1
1818
ports:
1919
- 80:80
20+
environment:
21+
- AUTO_INIT=false
2022
# environment:
2123
# - VECTORDB_URL=http://vectordb:6334/
2224
# NOTE: dirty hack to fix a bug with podman internal network on prod server

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,13 @@ dependencies = [
3434
"SPARQLWrapper >=2.0.0",
3535
"beautifulsoup4 >=4.13.0",
3636
"curies >=0.11.0",
37-
"langchain-core >=1.2.6",
38-
"mcp >=1.25.0",
37+
"mcp >=1.25.0,<2",
3938
"qdrant-client >=1.16.2",
4039
"fastembed >=0.7.4",
41-
"langgraph >=1.0.5",
40+
"langchain-core >=1.2.6",
4241
"langchain-qdrant >=1.1.0",
4342
"langchain-community >=0.4.1",
43+
"langgraph >=1.0.5",
4444
"markdownify >=1.1.0",
4545
"pandas >=2.2.3",
4646
]

src/sparql_llm/agent/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ class Settings(BaseSettings):
127127
embedding_model: str = "BAAI/bge-small-en-v1.5"
128128
embedding_dimensions: int = 384
129129
force_index: bool = False
130+
# Automatically initialize the vector store client, should be False when deploying in prod with multiple workers
131+
auto_init: bool = True
130132

131133
# Sparse embeddings are only used for the entities resolution
132134
sparse_embedding_model: str = "Qdrant/bm25"

src/sparql_llm/agent/indexing/index_resources.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,6 @@ def load_resources(file: str = "expasy_resources_metadata.csv") -> list[Document
157157
def init_vectordb() -> None:
158158
"""Initialize the vectordb with example queries and ontology descriptions from the SPARQL endpoints"""
159159
docs: list[Document] = []
160-
161-
# endpoints_urls = [endpoint["endpoint_url"] for endpoint in settings.endpoints]
162160
prefix_map, _void_schema = get_prefixes_and_schema_for_endpoints(settings.endpoints)
163161

164162
# Gets documents from the SPARQL endpoints

src/sparql_llm/mcp_server.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,21 @@
3131
)
3232

3333
# Check if the docs collection exists and has data, initialize if not
34+
# In prod with multiple workers, auto_init should be set to False to avoid race conditions
3435
try:
35-
collection_exists = qdrant_client.collection_exists(settings.docs_collection_name)
36-
if (
36+
collection_needs_init = (
3737
settings.force_index
3838
or not qdrant_client.collection_exists(settings.docs_collection_name)
3939
or not qdrant_client.get_collection(settings.docs_collection_name).points_count
40-
):
40+
)
41+
if settings.auto_init and collection_needs_init:
4142
logger.info("📊 Initializing vectordb...")
4243
init_vectordb()
44+
elif not settings.auto_init and collection_needs_init:
45+
logger.warning(
46+
f"⚠️ Collection '{settings.docs_collection_name}' does not exist or is empty. Run the following command to initialize it:\n"
47+
"docker compose -f compose.prod.yml exec api uv run src/sparql_llm/agent/indexing/index_resources.py"
48+
)
4349
else:
4450
logger.info(
4551
f"✅ Collection '{settings.docs_collection_name}' exists with {qdrant_client.get_collection(settings.docs_collection_name).points_count} points. Skipping initialization."

src/sparql_llm/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def get_prefixes_and_schema_for_endpoints(
7272
)
7373
logger.info(f"Fetching {endpoint['endpoint_url']} metadata...")
7474
prefixes_map = get_prefixes_for_endpoint(endpoint["endpoint_url"], endpoint.get("examples_file"), prefixes_map)
75+
# Cache the metadata in a JSON file
7576
with open(ENDPOINTS_METADATA_FILE, "w") as f:
7677
json.dump({"prefixes_map": prefixes_map, "classes_schema": endpoints_void_dict}, f, indent=2)
7778
return prefixes_map, endpoints_void_dict

0 commit comments

Comments
 (0)