Skip to content

Commit 560a5d9

Browse files
committed
refactor: improve handling of endpoints metadata to lazy-load it
1 parent 9435c64 commit 560a5d9

File tree

13 files changed

+120
-113
lines changed

13 files changed

+120
-113
lines changed

compose.override.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ services:
88
api:
99
ports:
1010
- 8000:8000
11-
# environment:
12-
# - DEFAULT_LLM_MODEL=openrouter/openai/gpt-5.1
11+
environment:
12+
- DEFAULT_LLM_MODEL=openrouter/openai/gpt-5.1
1313
# - USE_TOOLS=true
1414
# - FORCE_REINDEX=true
1515
# - DEFAULT_LLM_MODEL=openrouter/openai/gpt-5.2

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,14 @@ dependencies = [
3838
"qdrant-client >=1.16.2",
3939
"fastembed >=0.7.4",
4040
"langchain-core >=1.2.6",
41-
"langgraph >=1.0.5",
4241
"markdownify >=1.1.0",
4342
"pandas >=2.2.3",
4443
]
4544

4645
[project.optional-dependencies]
4746
agent = [
4847
# LangGraph dependencies
48+
"langgraph >=1.0.5",
4949
"langchain >=1.2.0",
5050
"langchain-openai >=1.1.6",
5151
# "langchain-azure-ai >=0.1.0",

src/sparql_llm/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
__version__ = "0.1.3"
44

5-
from .utils import SparqlEndpointLinks, query_sparql
5+
from .config import SparqlEndpointLinks
6+
from .utils import query_sparql
67
from .validate_sparql import validate_sparql, validate_sparql_in_msg, validate_sparql_with_void
78
from .loaders.sparql_examples_loader import SparqlExamplesLoader
89
from .loaders.sparql_void_shapes_loader import SparqlVoidShapesLoader, get_shex_dict_from_void, get_shex_from_void

src/sparql_llm/agent/nodes/validation.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,9 @@
1010
from sparql_llm.agent.prompts import FIX_QUERY_PROMPT
1111
from sparql_llm.agent.state import State, StepOutput
1212
from sparql_llm.config import Configuration, settings
13-
from sparql_llm.utils import get_prefixes_and_schema_for_endpoints, query_sparql
13+
from sparql_llm.utils import endpoints_metadata, query_sparql
1414
from sparql_llm.validate_sparql import validate_sparql_in_msg
1515

16-
prefixes_map, endpoints_void_dict = get_prefixes_and_schema_for_endpoints(settings.endpoints)
17-
1816

1917
async def validate_output(state: State, config: RunnableConfig) -> dict[str, Any]:
2018
"""LangGraph node to validate the output of a LLM call, e.g. SPARQL queries generated.
@@ -34,7 +32,7 @@ async def validate_output(state: State, config: RunnableConfig) -> dict[str, Any
3432
validation_steps: list[StepOutput] = []
3533
recall_messages: list[HumanMessage] = []
3634

37-
validation_outputs = validate_sparql_in_msg(last_msg, prefixes_map, endpoints_void_dict)
35+
validation_outputs = validate_sparql_in_msg(last_msg, endpoints_metadata.prefixes_map, endpoints_metadata.void_dict)
3836
for validation_output in validation_outputs:
3937
if validation_output["fixed_query"]:
4038
# Pass the fixed msg to the client

src/sparql_llm/config.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,27 @@
66
import os
77
from dataclasses import dataclass, field, fields
88
from pathlib import Path
9-
from typing import Annotated, Any, TypeVar
9+
from typing import Annotated, Any, Required, TypedDict, TypeVar
1010

1111
from fastembed import TextEmbedding
1212
from langchain_core.runnables import RunnableConfig, ensure_config
1313
from pydantic_settings import BaseSettings, SettingsConfigDict
1414
from qdrant_client import QdrantClient
1515

1616
from sparql_llm.agent import prompts
17-
from sparql_llm.utils import SparqlEndpointLinks
17+
18+
19+
# Total=False to make all fields optional except those marked as Required
20+
class SparqlEndpointLinks(TypedDict, total=False):
21+
"""A dictionary to store links and filepaths about a SPARQL endpoint."""
22+
23+
endpoint_url: Required[str]
24+
void_file: str | None
25+
examples_file: str | None
26+
homepage_url: str | None
27+
label: str | None
28+
description: str | None
29+
# ontology_url: Optional[str]
1830

1931

2032
class Settings(BaseSettings):

src/sparql_llm/indexing/index_resources.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
from rdflib import RDF, Dataset, Namespace
1111

1212
from sparql_llm import SparqlExamplesLoader, SparqlInfoLoader, SparqlVoidShapesLoader
13-
from sparql_llm.config import embedding_model, qdrant_client, settings
13+
from sparql_llm.config import SparqlEndpointLinks, embedding_model, qdrant_client, settings
1414
from sparql_llm.loaders.sparql_info_loader import GENERAL_INFO_DOC_TYPE
15-
from sparql_llm.utils import SparqlEndpointLinks, get_prefixes_and_schema_for_endpoints
15+
from sparql_llm.utils import endpoints_metadata
1616

1717
SCHEMA = Namespace("http://schema.org/")
1818

@@ -160,9 +160,9 @@ def load_expasy_resources_infos(file: str = "expasy_resources_metadata.csv") ->
160160

161161

162162
def init_vectordb() -> None:
163-
"""Initialize the vectordb with example queries and ontology descriptions from the SPARQL endpoints"""
163+
"""Initialize the vectordb with example queries and ontology descriptions from the SPARQL endpoints."""
164164
docs: list[Document] = []
165-
prefix_map, _void_schema = get_prefixes_and_schema_for_endpoints(settings.endpoints)
165+
endpoints_metadata._ensure_loaded()
166166

167167
# Gets documents from the SPARQL endpoints
168168
for endpoint in settings.endpoints:
@@ -174,7 +174,7 @@ def init_vectordb() -> None:
174174

175175
docs += SparqlVoidShapesLoader(
176176
endpoint["endpoint_url"],
177-
prefix_map=prefix_map,
177+
prefix_map=endpoints_metadata.prefixes_map,
178178
void_file=endpoint.get("void_file"),
179179
examples_file=endpoint.get("examples_file"),
180180
).load()

src/sparql_llm/loaders/sparql_info_loader.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from langchain_core.document_loaders.base import BaseLoader
22
from langchain_core.documents import Document
33

4-
from sparql_llm.utils import SparqlEndpointLinks, logger
4+
from sparql_llm.config import SparqlEndpointLinks
5+
from sparql_llm.utils import logger
56

67
GENERAL_INFO_DOC_TYPE = "General information"
78

src/sparql_llm/loaders/sparql_void_shapes_loader.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ def get_shex_dict_from_void(
3939
shex_dict = {}
4040

4141
for subject_cls, predicates in void_dict.items():
42-
if ignore_namespaces(namespaces_to_ignore, subject_cls):
42+
if ignore_namespaces(namespaces_to_ignore, subject_cls) and subject_cls not in [
43+
"http://www.w3.org/2002/07/owl#Class",
44+
"http://www.w3.org/2000/01/rdf-schema#Class",
45+
]:
4346
continue
4447
try:
4548
subj = prefix_converter.compress(subject_cls, passthrough=True)

src/sparql_llm/mcp_server.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,9 @@
44
from mcp.server.fastmcp import FastMCP
55
from qdrant_client.models import FieldCondition, Filter, MatchValue, ScoredPoint
66

7-
from sparql_llm.agent.nodes.validation import endpoints_void_dict, prefixes_map
87
from sparql_llm.config import embedding_model, qdrant_client, settings
98
from sparql_llm.indexing.index_resources import init_vectordb
10-
from sparql_llm.utils import logger, query_sparql
9+
from sparql_llm.utils import endpoints_metadata, logger, query_sparql
1110
from sparql_llm.validate_sparql import validate_sparql
1211

1312
# What are the rat orthologs of the human TP53?
@@ -222,7 +221,9 @@ def execute_sparql_query(sparql_query: str, endpoint_url: str) -> str:
222221
"""
223222
resp_msg = ""
224223
# First check if query valid based on classes schema and known prefixes
225-
validation_output = validate_sparql(sparql_query, endpoint_url, prefixes_map, endpoints_void_dict)
224+
validation_output = validate_sparql(
225+
sparql_query, endpoint_url, endpoints_metadata.prefixes_map, endpoints_metadata.void_dict
226+
)
226227
if validation_output["fixed_query"]:
227228
# Pass the fixed query to the client
228229
resp_msg += f"Fixed the prefixes of the generated SPARQL query automatically:\n```sparql\n{validation_output['fixed_query']}\n```\n"
@@ -256,8 +257,6 @@ def execute_sparql_query(sparql_query: str, endpoint_url: str) -> str:
256257
return resp_msg
257258

258259

259-
# prefixes_map, endpoints_void_dict = get_prefixes_and_schema_for_endpoints(settings.endpoints)
260-
261260
FIX_QUERY_PROMPT = """Please fix the query, and try again.
262261
We suggest you to make the query less restricted, e.g. use a broader regex for string matching instead of exact match,
263262
ignore case, make sure you are not overriding an existing variable with BIND, or break down your query in smaller parts

src/sparql_llm/utils.py

Lines changed: 71 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
import json
22
import logging
33
from pathlib import Path
4-
from typing import Any, Required, TypedDict
4+
from typing import Any
55

66
import curies
77
import httpx
88
import rdflib
99

10+
from sparql_llm.config import SparqlEndpointLinks, settings
11+
1012
# Disable logger in your code with logging.getLogger("sparql_llm").setLevel(logging.WARNING)
1113
logger = logging.getLogger("sparql_llm")
1214
logger.setLevel(logging.INFO)
@@ -19,19 +21,6 @@
1921
logging.getLogger("httpx").setLevel(logging.WARNING)
2022

2123

22-
# Total=False to make all fields optional except those marked as Required
23-
class SparqlEndpointLinks(TypedDict, total=False):
24-
"""A dictionary to store links and filepaths about a SPARQL endpoint."""
25-
26-
endpoint_url: Required[str]
27-
void_file: str | None
28-
examples_file: str | None
29-
homepage_url: str | None
30-
label: str | None
31-
description: str | None
32-
# ontology_url: Optional[str]
33-
34-
3524
# Prefixes utilities
3625

3726
GET_PREFIXES_QUERY = """PREFIX sh: <http://www.w3.org/ns/shacl#>
@@ -45,40 +34,6 @@ class SparqlEndpointLinks(TypedDict, total=False):
4534
ENDPOINTS_METADATA_FILE = Path("data") / "endpoints_metadata.json"
4635

4736

48-
def load_endpoints_metadata_file() -> tuple[dict[str, str], "EndpointsSchemaDict"]:
49-
"""Load prefixes and schema from the cached metadata file."""
50-
try:
51-
with open(ENDPOINTS_METADATA_FILE) as f:
52-
data = json.load(f)
53-
logger.info(
54-
f"💾 Loaded endpoints metadata from {ENDPOINTS_METADATA_FILE.resolve()} for {len(data.get('classes_schema', {}))} endpoints"
55-
)
56-
return data.get("prefixes_map", {}), data.get("classes_schema", {})
57-
except Exception as e:
58-
logger.warning(f"Could not load metadata from {ENDPOINTS_METADATA_FILE}: {e}")
59-
return {}, {}
60-
61-
62-
def get_prefixes_and_schema_for_endpoints(
63-
endpoints: list[SparqlEndpointLinks],
64-
) -> tuple[dict[str, str], "EndpointsSchemaDict"]:
65-
"""Return a dictionary of prefixes and a dictionary of VoID classes schema for the given endpoints."""
66-
prefixes_map, endpoints_void_dict = load_endpoints_metadata_file()
67-
if prefixes_map and endpoints_void_dict:
68-
return prefixes_map, endpoints_void_dict
69-
logger.info(f"Fetching metadata for {len(endpoints)} endpoints...")
70-
for endpoint in endpoints:
71-
endpoints_void_dict[endpoint["endpoint_url"]] = get_schema_for_endpoint(
72-
endpoint["endpoint_url"], endpoint.get("void_file")
73-
)
74-
logger.info(f"Fetching {endpoint['endpoint_url']} metadata...")
75-
prefixes_map = get_prefixes_for_endpoint(endpoint["endpoint_url"], endpoint.get("examples_file"), prefixes_map)
76-
# Cache the metadata in a JSON file
77-
with open(ENDPOINTS_METADATA_FILE, "w") as f:
78-
json.dump({"prefixes_map": prefixes_map, "classes_schema": endpoints_void_dict}, f, indent=2)
79-
return prefixes_map, endpoints_void_dict
80-
81-
8237
def get_prefixes_for_endpoint(
8338
endpoint_url: str, examples_file: str | None = None, prefixes_map: dict[str, str] | None = None
8439
) -> dict[str, str]:
@@ -143,33 +98,6 @@ def get_schema_for_endpoint(endpoint_url: str, void_file: str | None = None) ->
14398
Formatted as: dict[subject_cls][predicate] = list[object_cls/datatype]"""
14499
void_dict: SchemaDict = {}
145100
try:
146-
# if void_file:
147-
# g = rdflib.Graph()
148-
# if void_file.startswith(("http://", "https://")):
149-
# # Handle URL case
150-
# with httpx.Client() as client:
151-
# for attempt in range(10):
152-
# # Retry a few times in case of HTTP errors, e.g. https://sparql.uniprot.org/.well-known/void/
153-
# try:
154-
# resp = client.get(void_file, headers={"Accept": "text/turtle"}, follow_redirects=True)
155-
# resp.raise_for_status()
156-
# if resp.text.strip() == "":
157-
# raise ValueError(f"Empty response for VoID description from {void_file}")
158-
# g.parse(data=resp.text, format="turtle")
159-
# break
160-
# except Exception as e:
161-
# if attempt == 3:
162-
# raise e
163-
# time.sleep(1)
164-
# continue
165-
# else:
166-
# # Handle local file case
167-
# g.parse(void_file, format="turtle")
168-
# results = g.query(GET_VOID_DESC)
169-
# bindings = [{str(k): {"value": str(v)} for k, v in row.asdict().items()} for row in results]
170-
# else:
171-
# bindings = query_sparql(GET_VOID_DESC, endpoint_url)["results"]["bindings"]
172-
173101
for void_triple in query_sparql(GET_VOID_DESC, endpoint_url, use_file=void_file, check_service_desc=True)[
174102
"results"
175103
]["bindings"]:
@@ -192,12 +120,7 @@ def get_schema_for_endpoint(endpoint_url: str, void_file: str | None = None) ->
192120
return void_dict
193121

194122

195-
# TODO: use SPARQLWrapper
196-
# sparqlw = SPARQLWrapper(endpoint)
197-
# sparqlw.setReturnFormat(JSON)
198-
# sparqlw.setOnlyConneg(True)
199-
# sparqlw.setQuery(query)
200-
# res = sparqlw.query().convert()
123+
# Use https://github.com/lu-pl/sparqlx ?
201124
def query_sparql(
202125
query: str,
203126
endpoint_url: str,
@@ -267,3 +190,70 @@ def query_sparql(
267190
if should_close:
268191
client.close()
269192
return query_resp
193+
194+
195+
class EndpointsMetadataManager:
196+
"""Lazy-loading manager for endpoints metadata."""
197+
198+
def __init__(self, endpoints: list[SparqlEndpointLinks], auto_init: bool = True) -> None:
199+
self._endpoints = endpoints
200+
self._prefixes_map: dict[str, str] = {}
201+
self._void_dict: EndpointsSchemaDict = {}
202+
self._initialized = False
203+
if auto_init:
204+
self._ensure_loaded()
205+
206+
def _ensure_loaded(self) -> None:
207+
"""Load metadata if not already loaded."""
208+
if self._initialized:
209+
return
210+
# Try loading from file first
211+
try:
212+
with open(ENDPOINTS_METADATA_FILE) as f:
213+
data = json.load(f)
214+
self._prefixes_map = data.get("prefixes_map", {})
215+
self._void_dict = data.get("classes_schema", {})
216+
if self._prefixes_map and self._void_dict:
217+
logger.info(
218+
f"💾 Loaded endpoints metadata from {ENDPOINTS_METADATA_FILE.resolve()} "
219+
f"for {len(self._void_dict)} endpoints"
220+
)
221+
return
222+
except Exception as e:
223+
logger.debug(f"Could not load metadata from {ENDPOINTS_METADATA_FILE}: {e}")
224+
225+
logger.info(f"Fetching metadata for {len(self._endpoints)} endpoints...")
226+
for endpoint in self._endpoints:
227+
self._void_dict[endpoint["endpoint_url"]] = get_schema_for_endpoint(
228+
endpoint["endpoint_url"], endpoint.get("void_file")
229+
)
230+
logger.info(f"Fetching {endpoint['endpoint_url']} metadata...")
231+
self._prefixes_map = get_prefixes_for_endpoint(
232+
endpoint["endpoint_url"], endpoint.get("examples_file"), self._prefixes_map
233+
)
234+
# Cache to JSON file
235+
with open(ENDPOINTS_METADATA_FILE, "w") as f:
236+
json.dump({"prefixes_map": self._prefixes_map, "classes_schema": self._void_dict}, f, indent=2)
237+
self._initialized = True
238+
logger.info(f"💾 Cached endpoints metadata to {ENDPOINTS_METADATA_FILE.resolve()}")
239+
240+
@property
241+
def prefixes_map(self) -> dict[str, str]:
242+
"""Get prefixes map, loading lazily if needed."""
243+
self._ensure_loaded()
244+
return self._prefixes_map or {}
245+
246+
@property
247+
def void_dict(self) -> "EndpointsSchemaDict":
248+
"""Get endpoints VoID schema dict, loading lazily if needed."""
249+
self._ensure_loaded()
250+
return self._void_dict or {}
251+
252+
# def reset(self) -> None:
253+
# """Reset cached metadata (useful for re-initialization after init_vectordb)."""
254+
# self._prefixes_map = {}
255+
# self._void_dict = {}
256+
257+
258+
# Global instance, metadata loads lazily on first property access
259+
endpoints_metadata = EndpointsMetadataManager(settings.endpoints, settings.auto_init)

0 commit comments

Comments (0)