11import json
22import logging
33from pathlib import Path
4- from typing import Any , Required , TypedDict
4+ from typing import Any
55
66import curies
77import httpx
88import rdflib
99
10+ from sparql_llm .config import SparqlEndpointLinks , settings
11+
1012# Disable logger in your code with logging.getLogger("sparql_llm").setLevel(logging.WARNING)
1113logger = logging .getLogger ("sparql_llm" )
1214logger .setLevel (logging .INFO )
# Silence httpx's per-request INFO logs emitted while querying SPARQL endpoints
logging.getLogger("httpx").setLevel(logging.WARNING)
2022
2123
# total=False makes every field optional except those wrapped in Required[]
class SparqlEndpointLinks(TypedDict, total=False):
    """A dictionary to store links and filepaths about a SPARQL endpoint."""

    # URL of the SPARQL endpoint — the only mandatory field
    endpoint_url: Required[str]
    # URL or local filepath of a VoID description for the endpoint
    # (used instead of querying the endpoint when provided)
    void_file: str | None
    # URL or local filepath of example SPARQL queries (used for prefix extraction)
    examples_file: str | None
    # Homepage URL of the resource behind the endpoint — not used in this chunk
    homepage_url: str | None
    # Short human-readable label — not used in this chunk
    label: str | None
    # Longer human-readable description — not used in this chunk
    description: str | None
    # ontology_url: Optional[str]
34-
3524# Prefixes utilities
3625
3726GET_PREFIXES_QUERY = """PREFIX sh: <http://www.w3.org/ns/shacl#>
@@ -45,40 +34,6 @@ class SparqlEndpointLinks(TypedDict, total=False):
# Local JSON file caching the prefixes map and VoID classes schema fetched from the endpoints
ENDPOINTS_METADATA_FILE = Path("data") / "endpoints_metadata.json"
4635
4736
def load_endpoints_metadata_file() -> tuple[dict[str, str], "EndpointsSchemaDict"]:
    """Read the cached metadata JSON and return ``(prefixes_map, classes_schema)``.

    Falls back to two empty dicts when the cache file is missing or unreadable.
    """
    try:
        data = json.loads(ENDPOINTS_METADATA_FILE.read_text())
        logger.info(
            f"💾 Loaded endpoints metadata from {ENDPOINTS_METADATA_FILE.resolve()} for {len(data.get('classes_schema', {}))} endpoints"
        )
        return data.get("prefixes_map", {}), data.get("classes_schema", {})
    except Exception as e:
        # Best-effort cache read: any failure just means we will re-fetch later
        logger.warning(f"Could not load metadata from {ENDPOINTS_METADATA_FILE}: {e}")
        return {}, {}
61-
def get_prefixes_and_schema_for_endpoints(
    endpoints: list[SparqlEndpointLinks],
) -> tuple[dict[str, str], "EndpointsSchemaDict"]:
    """Return a dictionary of prefixes and a dictionary of VoID classes schema for the given endpoints.

    Results are served from the local cache file when it holds data; otherwise
    metadata is fetched from each endpoint and written back to the cache.
    """
    prefixes_map, endpoints_void_dict = load_endpoints_metadata_file()
    if prefixes_map and endpoints_void_dict:
        return prefixes_map, endpoints_void_dict
    logger.info(f"Fetching metadata for {len(endpoints)} endpoints...")
    for endpoint in endpoints:
        # Fix: log BEFORE the potentially slow network fetch (the original
        # logged "Fetching..." only after the schema had already been fetched)
        logger.info(f"Fetching {endpoint['endpoint_url']} metadata...")
        endpoints_void_dict[endpoint["endpoint_url"]] = get_schema_for_endpoint(
            endpoint["endpoint_url"], endpoint.get("void_file")
        )
        prefixes_map = get_prefixes_for_endpoint(endpoint["endpoint_url"], endpoint.get("examples_file"), prefixes_map)
    # Cache the metadata in a JSON file; create the parent folder first so the
    # write does not fail with FileNotFoundError when "data/" is absent
    ENDPOINTS_METADATA_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(ENDPOINTS_METADATA_FILE, "w") as f:
        json.dump({"prefixes_map": prefixes_map, "classes_schema": endpoints_void_dict}, f, indent=2)
    return prefixes_map, endpoints_void_dict
81-
8237def get_prefixes_for_endpoint (
8338 endpoint_url : str , examples_file : str | None = None , prefixes_map : dict [str , str ] | None = None
8439) -> dict [str , str ]:
@@ -143,33 +98,6 @@ def get_schema_for_endpoint(endpoint_url: str, void_file: str | None = None) ->
14398 Formatted as: dict[subject_cls][predicate] = list[object_cls/datatype]"""
14499 void_dict : SchemaDict = {}
145100 try :
146- # if void_file:
147- # g = rdflib.Graph()
148- # if void_file.startswith(("http://", "https://")):
149- # # Handle URL case
150- # with httpx.Client() as client:
151- # for attempt in range(10):
152- # # Retry a few times in case of HTTP errors, e.g. https://sparql.uniprot.org/.well-known/void/
153- # try:
154- # resp = client.get(void_file, headers={"Accept": "text/turtle"}, follow_redirects=True)
155- # resp.raise_for_status()
156- # if resp.text.strip() == "":
157- # raise ValueError(f"Empty response for VoID description from {void_file}")
158- # g.parse(data=resp.text, format="turtle")
159- # break
160- # except Exception as e:
161- # if attempt == 3:
162- # raise e
163- # time.sleep(1)
164- # continue
165- # else:
166- # # Handle local file case
167- # g.parse(void_file, format="turtle")
168- # results = g.query(GET_VOID_DESC)
169- # bindings = [{str(k): {"value": str(v)} for k, v in row.asdict().items()} for row in results]
170- # else:
171- # bindings = query_sparql(GET_VOID_DESC, endpoint_url)["results"]["bindings"]
172-
173101 for void_triple in query_sparql (GET_VOID_DESC , endpoint_url , use_file = void_file , check_service_desc = True )[
174102 "results"
175103 ]["bindings" ]:
@@ -192,12 +120,7 @@ def get_schema_for_endpoint(endpoint_url: str, void_file: str | None = None) ->
192120 return void_dict
193121
194122
195- # TODO: use SPARQLWrapper
196- # sparqlw = SPARQLWrapper(endpoint)
197- # sparqlw.setReturnFormat(JSON)
198- # sparqlw.setOnlyConneg(True)
199- # sparqlw.setQuery(query)
200- # res = sparqlw.query().convert()
123+ # Use https://github.com/lu-pl/sparqlx ?
201124def query_sparql (
202125 query : str ,
203126 endpoint_url : str ,
@@ -267,3 +190,70 @@ def query_sparql(
267190 if should_close :
268191 client .close ()
269192 return query_resp
193+
194+
class EndpointsMetadataManager:
    """Lazy-loading manager for endpoints metadata (prefixes map + VoID schema).

    Metadata is loaded from the local JSON cache file when possible, otherwise
    fetched from the endpoints and cached back to disk. Loading runs at most
    once per instance: eagerly at construction when ``auto_init`` is true, or
    lazily on first property access.
    """

    def __init__(self, endpoints: list[SparqlEndpointLinks], auto_init: bool = True) -> None:
        self._endpoints = endpoints
        self._prefixes_map: dict[str, str] = {}
        self._void_dict: EndpointsSchemaDict = {}
        self._initialized = False
        if auto_init:
            self._ensure_loaded()

    def _ensure_loaded(self) -> None:
        """Load metadata if not already loaded (idempotent)."""
        if self._initialized:
            return
        # Try loading from the cache file first
        try:
            with open(ENDPOINTS_METADATA_FILE) as f:
                data = json.load(f)
            self._prefixes_map = data.get("prefixes_map", {})
            self._void_dict = data.get("classes_schema", {})
            if self._prefixes_map and self._void_dict:
                logger.info(
                    f"💾 Loaded endpoints metadata from {ENDPOINTS_METADATA_FILE.resolve()} "
                    f"for {len(self._void_dict)} endpoints"
                )
                # Bug fix: mark as initialized here too, otherwise every
                # property access re-opened and re-parsed the JSON cache file
                self._initialized = True
                return
        except Exception as e:
            logger.debug(f"Could not load metadata from {ENDPOINTS_METADATA_FILE}: {e}")

        logger.info(f"Fetching metadata for {len(self._endpoints)} endpoints...")
        for endpoint in self._endpoints:
            # Fix: log BEFORE the potentially slow fetch, not after it completes
            logger.info(f"Fetching {endpoint['endpoint_url']} metadata...")
            self._void_dict[endpoint["endpoint_url"]] = get_schema_for_endpoint(
                endpoint["endpoint_url"], endpoint.get("void_file")
            )
            self._prefixes_map = get_prefixes_for_endpoint(
                endpoint["endpoint_url"], endpoint.get("examples_file"), self._prefixes_map
            )
        # Cache to JSON file; create the parent folder first so the write does
        # not fail with FileNotFoundError when "data/" is absent
        ENDPOINTS_METADATA_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(ENDPOINTS_METADATA_FILE, "w") as f:
            json.dump({"prefixes_map": self._prefixes_map, "classes_schema": self._void_dict}, f, indent=2)
        self._initialized = True
        logger.info(f"💾 Cached endpoints metadata to {ENDPOINTS_METADATA_FILE.resolve()}")

    @property
    def prefixes_map(self) -> dict[str, str]:
        """Get prefixes map, loading lazily if needed."""
        self._ensure_loaded()
        return self._prefixes_map or {}

    @property
    def void_dict(self) -> "EndpointsSchemaDict":
        """Get endpoints VoID schema dict, loading lazily if needed."""
        self._ensure_loaded()
        return self._void_dict or {}
257+
# Global singleton; metadata is fetched/loaded lazily on first property access
# (or eagerly at import time when settings.auto_init is true)
endpoints_metadata = EndpointsMetadataManager(settings.endpoints, settings.auto_init)
0 commit comments