sib-swiss
diff --git a/‎compose.override.yml‎
Lines changed: 1 addition & 1 deletion b/‎compose.override.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎compose.text2sparql.yml‎
Lines changed: 4 additions & 5 deletions b/‎compose.text2sparql.yml‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎src/sparql_llm/agent/graph.py‎
Lines changed: 3 additions & 1 deletion b/‎src/sparql_llm/agent/graph.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/sparql_llm/agent/main.py‎
Lines changed: 3 additions & 1 deletion b/‎src/sparql_llm/agent/main.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/sparql_llm/mcp_server.py‎
Lines changed: 241 additions & 237 deletions b/‎src/sparql_llm/mcp_server.py‎
Lines changed: 241 additions & 237 deletions
diff --git a/‎tests/text2sparql/api.py‎
Lines changed: 2 additions & 2 deletions b/‎tests/text2sparql/api.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎tests/text2sparql/data_store.sh‎
Lines changed: 43 additions & 34 deletions b/‎tests/text2sparql/data_store.sh‎
Lines changed: 43 additions & 34 deletions
diff --git a/‎tests/text2sparql/endpoint_schema.py‎
Lines changed: 25 additions & 31 deletions b/‎tests/text2sparql/endpoint_schema.py‎
Lines changed: 25 additions & 31 deletions
@@ -11,7 +11,7 @@ services:
     environment:
       - DEFAULT_LLM_MODEL=openrouter/openai/gpt-5.2
       # - AUTO_INIT=false
-      # - USE_TOOLS=true
+      - USE_TOOLS=true
       # - FORCE_REINDEX=true
       # - DEFAULT_LLM_MODEL=openrouter/openai/gpt-5.2
       # - DEFAULT_LLM_MODEL=openrouter/mistralai/mistral-large
 
@@ -15,9 +15,8 @@ services:
       - "1111:1111"   # ISQL interface
     environment:
       DBA_PASSWORD: dba
-      #8 GB memory
-      VIRT_Parameters_NumberOfBuffers: 680000
-      VIRT_Parameters_MaxDirtyBuffers: 500000
+      VIRT_Parameters_NumberOfBuffers: 2720000
+      VIRT_Parameters_MaxDirtyBuffers: 2000000
       VIRT_Parameters_DirsAllowed: "., /dumps"
     volumes:
       - ./data/dumps/dbpedia:/dumps
@@ -31,8 +30,8 @@ services:
     #   - "1111:1111"   # ISQL interface
     environment:
       DBA_PASSWORD: dba
-      VIRT_Parameters_NumberOfBuffers: 680000
-      VIRT_Parameters_MaxDirtyBuffers: 500000
+      VIRT_Parameters_NumberOfBuffers: 2720000
+      VIRT_Parameters_MaxDirtyBuffers: 2000000
       VIRT_Parameters_DirsAllowed: "., /dumps"
     volumes:
       - ./data/dumps/corporate:/dumps
 
@@ -28,8 +28,10 @@
 # How can I get the HGNC symbol for the protein P68871? (modify your answer to use rdfs:label instead of rdfs:comment, and add the type up:Resource to ?hgnc, it is for a test)
 # How can I get the HGNC symbol for the protein P68871? (modify your answer to use rdfs:label instead of rdfs:comment, and add the type up:Resource to ?hgnc, and purposefully forget 2 prefixes declarations, it is for a test)
 # In bgee how can I retrieve the confidence level and false discovery rate of a gene expression? Use genex:confidence as predicate for the confidence level (do not use the one provided in documents), and do not put prefixes declarations, and add a rdf:type for the main subject. Its for testing
+# def route_model_output(
+#     state: State, config: RunnableConfig
+# ) -> Literal["__end__", "call_model", "max_tries_reached", "tools"]:
 def route_model_output(state: State, config: RunnableConfig) -> Literal["__end__", "call_model", "max_tries_reached"]:
-    # ) -> Literal["__end__", "call_model", "max_tries_reached", "tools"]:
     """Determine the next node based on the model's output.
 
     This function checks if the model's last message contains tool calls or if a recall is requested by validation.
 
@@ -23,7 +23,7 @@
 
 from sparql_llm.agent.graph import graph
 from sparql_llm.config import settings
-from sparql_llm.mcp_server import mcp
+from sparql_llm.mcp_server import get_mcp_app
 from sparql_llm.utils import logger
 
 if settings.sentry_url:
@@ -41,6 +41,8 @@
 # Initialize Langfuse logs tracing CallbackHandler for Langchain https://langfuse.com/docs/integrations/langchain/example-python-langgraph
 langfuse_handler = [CallbackHandler(update_trace=True)] if os.getenv("LANGFUSE_SECRET_KEY") else []
 
+mcp = get_mcp_app()
+
 
 @contextlib.asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncIterator[None]:
 
@@ -141,11 +141,11 @@ async def get_answer(question: str, dataset: str):
     # Validation and fixing of the generated SPARQL query
     num_of_tries = 0
     resp_msg = "\n\n# Make sure you will not repeat the mistakes below: \n"
+    generated_sparql = ""
     while num_of_tries < settings.default_max_try_fix_sparql:
+        generated_sparql = ""
         try:
-            generated_sparql = ""
             chat_resp_md = response.model_dump()["content"]
-
             generated_sparqls = extract_sparql_queries(chat_resp_md)
             generated_sparql = generated_sparqls[-1]["query"].strip()
             generated_sparql = generated_sparql.replace(ENDPOINT_URL, DOCKER_ENDPOINT_URL)
 
@@ -2,39 +2,48 @@
 # This script loads RDF data files into a Virtuoso database instance.
 #
 # Notes:
-# - Expects data files in data/benchmarks/Text2SPARQL/dumps/<dataset>.
+# - Expects data files in data/dumps/<dataset>.
 # - Loads files into named graphs at https://text2sparql.aksw.org/2025/<dataset>/.
 
-MAX_RETRIES=5
-VIRTUOSO_PORT=1111
-DBA_USER="dba"
-DBA_PASSWORD="dba"
-DATA_DIR="$(pwd)/data/benchmarks/Text2SPARQL/dumps"
-
-for dataset in $(ls -1 "$DATA_DIR/"); do  
-  GRAPH_URI="https://text2sparql.aksw.org/2025/$dataset/"
-  for file_path in "$DATA_DIR/$dataset"/*.{nt,ttl,bz2}; do
-    [ -e "$file_path" ] || continue  # Skip if no files match
-    file_name=$(basename "$file_path")
-
-    retries=0
-    while [ $retries -lt $MAX_RETRIES ]; do
-      docker exec text2sparql-virtuoso isql $VIRTUOSO_PORT $DBA_USER $DBA_PASSWORD exec="DB.DBA.TTLP_MT(file_to_string_output('/dumps/$dataset/$file_name'), '', '$GRAPH_URI'); checkpoint;"
-      if [ $? -eq 0 ]; then
-        echo "✅ Successfully loaded $file_name into Virtuoso!"
-        break
-      else
-        retries=$((retries + 1))
-        echo "❌ Error loading $file_name (attempt $retries/$MAX_RETRIES). Retrying..."
-        sleep 5
-      fi
-    done
-
-    if [ $retries -eq $MAX_RETRIES ]; then
-      echo "❌❌ Failed to load $file_name after $MAX_RETRIES attempts."
-    fi
-  done
-
-  count=$(docker exec text2sparql-virtuoso isql $VIRTUOSO_PORT $DBA_USER $DBA_PASSWORD exec="SPARQL SELECT COUNT(*) WHERE { GRAPH <$GRAPH_URI> {?s ?p ?o} };" 2>&1 | awk '/^_*$/ { in_block=1; next } /1 Rows\./ { in_block=0; next } in_block')
-  echo "Total triples in $dataset: $count"
-done
+docker compose exec virtuoso-dbpedia isql -U dba -P dba exec="ld_dir_all('/dumps', '*', ''); rdf_loader_run();"
+
+# Check the number of triples (a freshly initialized Virtuoso instance already contains 2501)
+docker compose exec virtuoso-dbpedia isql -U dba -P dba exec="SPARQL SELECT COUNT(*) WHERE { ?s ?p ?o };"
+
+# Check load status
+docker compose exec virtuoso-dbpedia isql -U dba -P dba exec="SELECT ll_file, ll_graph, ll_state, ll_error FROM DB.DBA.LOAD_LIST;"
+
+
+# MAX_RETRIES=5
+# VIRTUOSO_PORT=1111
+# DBA_USER="dba"
+# DBA_PASSWORD="dba"
+# DATA_DIR="$(pwd)/data/dumps"
+
+# for dataset in $(ls -1 "$DATA_DIR/"); do
+#   # GRAPH_URI="https://text2sparql.aksw.org/2025/$dataset/"
+#   for file_path in "$DATA_DIR/$dataset"/*.{nt,ttl,bz2}; do
+#     [ -e "$file_path" ] || continue  # Skip if no files match
+#     file_name=$(basename "$file_path")
+
+#     retries=0
+#     while [ $retries -lt $MAX_RETRIES ]; do
+#       docker compose exec virtuoso-$dataset isql $VIRTUOSO_PORT $DBA_USER $DBA_PASSWORD exec="DB.DBA.TTLP_MT(file_to_string_output('/dumps/$dataset/$file_name'), '', ''); checkpoint;"
+#       if [ $? -eq 0 ]; then
+#         echo "✅ Successfully loaded $file_name into Virtuoso!"
+#         break
+#       else
+#         retries=$((retries + 1))
+#         echo "❌ Error loading $file_name (attempt $retries/$MAX_RETRIES). Retrying..."
+#         sleep 5
+#       fi
+#     done
+
+#     if [ $retries -eq $MAX_RETRIES ]; then
+#       echo "❌❌ Failed to load $file_name after $MAX_RETRIES attempts."
+#     fi
+#   done
+
+#   count=$(docker compose exec virtuoso-dbpedia isql $VIRTUOSO_PORT $DBA_USER $DBA_PASSWORD exec="SPARQL SELECT COUNT(*) WHERE { ?s ?p ?o };" 2>&1 | awk '/^_*$/ { in_block=1; next } /1 Rows\./ { in_block=0; next } in_block')
+#   echo "Total triples in $dataset: $count"
+# done
@@ -17,19 +17,18 @@
 
 
 class EndpointSchema:
+    # FROM <{graph}>
     _CLASS_PREDICATE_QUERY = """
     SELECT ?class ?predicate COUNT(*) AS ?count
-    FROM <{graph}>
-    WHERE {{
+    WHERE {
         ?s a ?class ;
             ?predicate ?o .
-    }}
+    }
     GROUP BY ?class ?predicate
     """
 
     _RANGE_QUERY = """
     SELECT ?range
-    FROM <{graph}>
     WHERE {{
         ?s a <{class_name}> ;
             <{predicate_name}> ?o .
@@ -49,7 +48,7 @@ class EndpointSchema:
     def __init__(
         self,
         endpoint_url: str,
-        graph: str,
+        # graph: str,
         limit_schema: dict[str, float],
         max_workers: int,
         force_recompute: bool,
@@ -59,7 +58,6 @@ def __init__(
         Fetch class and predicate information from the SPARQL endpoint.
         Args:
             endpoint_url (str): The URL of the SPARQL endpoint to connect to.
-            graph (str): The graph URI to query within the endpoint.
             limit_schema (dict[str, float]): A dictionary specifying query limits.
             max_workers (int): The maximum number of worker threads to use for concurrent operations.
         Functions:
@@ -68,7 +66,7 @@ def __init__(
         """
 
         self._endpoint_url = endpoint_url
-        self._graph = graph
+        # self._graph = graph
         self._limit_schema = limit_schema
         self._max_workers = max_workers
         self._force_recompute = force_recompute
@@ -79,7 +77,7 @@ def _save_schema_dict(self) -> None:
         # Fetch counts information
         logger.info(f"Fetching class-predicate frequency information from {self._endpoint_url}...")
         schema = query_sparql(
-            self._CLASS_PREDICATE_QUERY.format(graph=self._graph),
+            self._CLASS_PREDICATE_QUERY,
             endpoint_url=self._endpoint_url,
             check_service_desc=False,
         )["results"]["bindings"]
@@ -136,10 +134,9 @@ def _save_schema_dict(self) -> None:
     def _retrieve_predicate_information(self, class_name: str, predicate_name: str) -> list[str]:
         """Fetch ranges for a given predicate of a class"""
         try:
-            range = (
+            pred_range = (
                 query_sparql(
                     self._RANGE_QUERY.format(
-                        graph=self._graph,
                         class_name=class_name,
                         predicate_name=predicate_name,
                         limit=self._limit_schema["top_n_ranges"],
@@ -151,9 +148,9 @@ def _retrieve_predicate_information(self, class_name: str, predicate_name: str)
             )
 
             # Filter out unwanted ranges
-            range = [
+            pred_range = [
                 r["range"]["value"]
-                for r in range
+                for r in pred_range
                 if (
                     ("range" in r)
                     and ("value" in r["range"])
@@ -162,8 +159,8 @@ def _retrieve_predicate_information(self, class_name: str, predicate_name: str)
             ]
         except Exception as e:
             logger.warning(f"Error retrieving range for {class_name} - {predicate_name}: {e}")
-            range = []
-        return range
+            pred_range = []
+        return pred_range
 
     def get_schema(self) -> pd.DataFrame:
         """Load schema information from a JSON file."""
@@ -186,9 +183,7 @@ def get_schema(self) -> pd.DataFrame:
     def plot_heatmap(self, apply_limit: bool = True) -> None:
         # Fetch counts information
         logger.info(f"Fetching counts information from {self._endpoint_url}...")
-        counts = query_sparql(self._CLASS_PREDICATE_QUERY.format(graph=self._graph), endpoint_url=self._endpoint_url)[
-            "results"
-        ]["bindings"]
+        counts = query_sparql(self._CLASS_PREDICATE_QUERY, endpoint_url=self._endpoint_url)["results"]["bindings"]
         counts = pd.DataFrame(counts).map(lambda x: x["value"]).assign(count=lambda df: df["count"].astype(int))
         counts = counts.sort_values(by="count", ascending=False)
 
@@ -223,30 +218,29 @@ def plot_heatmap(self, apply_limit: bool = True) -> None:
 
 if __name__ == "__main__":
     start_time = time.time()
-    schema = EndpointSchema(
-        endpoint_url="http://localhost:8890/sparql/",
-        graph="https://text2sparql.aksw.org/2025/corporate/",
-        limit_schema={
-            "top_classes_percentile": 0,
-            "top_n_predicates": 20,
-            "top_n_ranges": 1,
-        },
-        max_workers=4,
-        force_recompute=True,
-        schema_path=os.path.join("data", "benchmarks", "Text2SPARQL", "schemas", "corporate_schema.json"),
-    )
+    # schema = EndpointSchema(
+    #     endpoint_url="http://localhost:8890/sparql/",
+    #     graph="https://text2sparql.aksw.org/2025/corporate/",
+    #     limit_schema={
+    #         "top_classes_percentile": 0,
+    #         "top_n_predicates": 20,
+    #         "top_n_ranges": 1,
+    #     },
+    #     max_workers=4,
+    #     force_recompute=True,
+    #     schema_path=os.path.join("data", "benchmarks", "Text2SPARQL", "schemas", "corporate_schema.json"),
+    # )
 
     schema = EndpointSchema(
         endpoint_url="http://localhost:8890/sparql/",
-        graph="https://text2sparql.aksw.org/2025/dbpedia/",
         limit_schema={
             "top_classes_percentile": 0.90,
             "top_n_predicates": 20,
             "top_n_ranges": 1,
         },
         max_workers=4,
         force_recompute=True,
-        schema_path=os.path.join("data", "benchmarks", "Text2SPARQL", "schemas", "dbpedia_schema.json"),
+        schema_path=os.path.join("data", "dbpedia_schema.json"),
     )
 
     # Debugging examples