Skip to content

Commit de6109f

Browse files
committed
refactor: move qdrant_client and embedding_model instantiation to indexing script
1 parent da03963 commit de6109f

File tree

9 files changed: 34 additions and 33 deletions (+34 / -33 lines changed)

src/sparql_llm/agent/nodes/retrieval_docs.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,8 @@
66

77
from sparql_llm.agent.state import State, StepOutput
88
from sparql_llm.agent.utils import get_msg_text
9-
from sparql_llm.config import Configuration, embedding_model, qdrant_client, settings
9+
from sparql_llm.config import Configuration, settings
10+
from sparql_llm.indexing.index_resources import embedding_model, qdrant_client
1011

1112
# TODO: use grouping? https://qdrant.tech/documentation/concepts/search/#grouping-api
1213
# Which tools can I use for enrichment analysis?

src/sparql_llm/agent/nodes/retrieval_entities.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,8 @@
77
from qdrant_client import models
88

99
from sparql_llm.agent.state import State, StepOutput
10-
from sparql_llm.config import Configuration, embedding_model, qdrant_client, settings
10+
from sparql_llm.config import Configuration, settings
11+
from sparql_llm.indexing.index_resources import embedding_model, qdrant_client
1112

1213
# NOTE: experimental, not used in production
1314

src/sparql_llm/agent/nodes/validation.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,8 @@
1010
from sparql_llm.agent.prompts import FIX_QUERY_PROMPT
1111
from sparql_llm.agent.state import State, StepOutput
1212
from sparql_llm.config import Configuration, settings
13-
from sparql_llm.utils import endpoints_metadata, query_sparql
13+
from sparql_llm.indexing.index_resources import endpoints_metadata
14+
from sparql_llm.utils import query_sparql
1415
from sparql_llm.validate_sparql import validate_sparql_in_msg
1516

1617

src/sparql_llm/config.py

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,8 @@
88
from pathlib import Path
99
from typing import Annotated, Any, Required, TypeVar
1010

11-
from fastembed import TextEmbedding
1211
from langchain_core.runnables import RunnableConfig, ensure_config
1312
from pydantic_settings import BaseSettings, SettingsConfigDict
14-
from qdrant_client import QdrantClient
1513
from typing_extensions import TypedDict
1614

1715
from sparql_llm.agent import prompts
@@ -224,20 +222,6 @@ def from_file(cls, filepath: str) -> Settings:
224222
settings = Settings.from_file(settings_filepath) if settings_filepath else Settings()
225223
# logger.info(f"📂 Using SETTINGS file: {settings_filepath}")
226224

227-
# settings = Settings()
228-
229-
# TODO: Getting `TypeError: cannot pickle '_thread.RLock' object` when doing `QdrantVectorStore.from_existing_collection(client=qdrant_client)`
230-
qdrant_client = (
231-
QdrantClient(url=settings.vectordb_url, prefer_grpc=True, timeout=600)
232-
if settings.vectordb_url.startswith(("http", "https"))
233-
else QdrantClient(path=settings.vectordb_url)
234-
)
235-
236-
embedding_model = TextEmbedding(
237-
settings.embedding_model,
238-
# providers=["CUDAExecutionProvider"], # Replace the fastembed dependency with fastembed-gpu to use your GPUs
239-
)
240-
241225

242226
# Configuration defined at runtime
243227
@dataclass(kw_only=True)

src/sparql_llm/indexing/index_entities.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,8 @@
66
from langchain_core.documents import Document
77
from qdrant_client import models
88

9-
from sparql_llm.config import qdrant_client, settings
9+
from sparql_llm.config import settings
10+
from sparql_llm.indexing.index_resources import qdrant_client
1011
from sparql_llm.utils import query_sparql
1112

1213
# NOTE: Run the script to extract entities from endpoints and generate embeddings for them (long):

src/sparql_llm/indexing/index_resources.py

Lines changed: 20 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,20 +3,37 @@
33
import httpx
44
import pandas as pd
55
from bs4 import BeautifulSoup
6+
from fastembed import TextEmbedding
67
from langchain_core.documents import Document
78
from markdownify import markdownify
8-
from qdrant_client import models
9+
from qdrant_client import QdrantClient, models
910
from qdrant_client.http.models import Distance, VectorParams
1011
from rdflib import RDF, Dataset, Namespace
1112

1213
from sparql_llm import SparqlExamplesLoader, SparqlInfoLoader, SparqlVoidShapesLoader
13-
from sparql_llm.config import SparqlEndpointLinks, embedding_model, qdrant_client, settings
14+
from sparql_llm.config import SparqlEndpointLinks, settings
1415
from sparql_llm.loaders.sparql_info_loader import GENERAL_INFO_DOC_TYPE
15-
from sparql_llm.utils import endpoints_metadata
16+
from sparql_llm.utils import EndpointsMetadataManager
1617

1718
SCHEMA = Namespace("http://schema.org/")
1819

1920

21+
# Global instance, metadata loads lazily on first property access
22+
endpoints_metadata = EndpointsMetadataManager(settings.endpoints, settings.auto_init)
23+
24+
# TODO: Getting `TypeError: cannot pickle '_thread.RLock' object` when doing `QdrantVectorStore.from_existing_collection(client=qdrant_client)`
25+
qdrant_client = (
26+
QdrantClient(url=settings.vectordb_url, prefer_grpc=True, timeout=600)
27+
if settings.vectordb_url.startswith(("http", "https"))
28+
else QdrantClient(path=settings.vectordb_url)
29+
)
30+
31+
embedding_model = TextEmbedding(
32+
settings.embedding_model,
33+
# providers=["CUDAExecutionProvider"], # Replace the fastembed dependency with fastembed-gpu to use your GPUs
34+
)
35+
36+
2037
def load_schemaorg_description(endpoint: SparqlEndpointLinks) -> list[Document]:
2138
"""Extract datasets descriptions from the schema.org metadata in homepage of the endpoint"""
2239
docs = []

src/sparql_llm/mcp_server.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,11 @@
44
from mcp.server.fastmcp import FastMCP
55
from qdrant_client.models import FieldCondition, Filter, MatchValue, ScoredPoint
66

7-
from sparql_llm.config import embedding_model, qdrant_client, settings
8-
from sparql_llm.indexing.index_resources import init_vectordb
9-
from sparql_llm.utils import endpoints_metadata, logger, query_sparql
7+
from sparql_llm.config import settings
8+
from sparql_llm.indexing.index_resources import embedding_model, endpoints_metadata, init_vectordb, qdrant_client
9+
from sparql_llm.utils import logger, query_sparql
1010
from sparql_llm.validate_sparql import validate_sparql
1111

12-
logger
1312
# What are the rat orthologs of the human TP53?
1413
# TODO: MCP integrated https://github.com/modelcontextprotocol/python-sdk/pull/1007
1514

src/sparql_llm/utils.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
import httpx
88
import rdflib
99

10-
from sparql_llm.config import SparqlEndpointLinks, settings
10+
from sparql_llm.config import SparqlEndpointLinks
1111

1212
# Disable logger in your code with logging.getLogger("sparql_llm").setLevel(logging.WARNING)
1313
logger = logging.getLogger("sparql_llm")
@@ -255,7 +255,3 @@ def void_dict(self) -> "EndpointsSchemaDict":
255255
# """Reset cached metadata (useful for re-initialization after init_vectordb)."""
256256
# self._prefixes_map = {}
257257
# self._void_dict = {}
258-
259-
260-
# Global instance, metadata loads lazily on first property access
261-
endpoints_metadata = EndpointsMetadataManager(settings.endpoints, settings.auto_init)

tests/benchmark_biodata.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,8 @@
2222

2323
# from sklearn.model_selection import KFold
2424
from sparql_llm import SparqlExamplesLoader, SparqlVoidShapesLoader
25-
from sparql_llm.config import embedding_model, qdrant_client, settings
25+
from sparql_llm.config import settings
26+
from sparql_llm.indexing.index_resources import embedding_model, qdrant_client
2627
from sparql_llm.utils import EndpointsMetadataManager, query_sparql
2728
from sparql_llm.validate_sparql import extract_sparql_queries
2829

0 commit comments

Comments
 (0)