
Commit 58cbca2

chore: use intfloat/multilingual-e5-large as default embedding model

1 parent: f57e63f

12 files changed: +25 additions, -23 deletions


compose.override.yml

Lines changed: 3 additions & 1 deletion
@@ -10,9 +10,11 @@ services:
       - 8000:8000
     environment:
       - DEFAULT_LLM_MODEL=openrouter/openai/gpt-5.1
+      # - AUTO_INIT=false
       # - USE_TOOLS=true
       # - FORCE_REINDEX=true
       # - DEFAULT_LLM_MODEL=openrouter/openai/gpt-5.2
       # - DEFAULT_LLM_MODEL=openrouter/mistralai/mistral-large
       # - DEFAULT_LLM_MODEL=openrouter/anthropic/claude-sonnet-4.5
-    entrypoint: ["uv", "run", "uvicorn", "src.sparql_llm.agent.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload", "--log-config", "logging.yml"]
+    entrypoint: ["uv", "run", "uvicorn", "src.sparql_llm.agent.main:app", "--host", "0.0.0.0", "--port", "8000", "--log-config", "logging.yml"]
+    # entrypoint: ["uv", "run", "uvicorn", "src.sparql_llm.agent.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload", "--log-config", "logging.yml"]

src/sparql_llm/agent/nodes/validation.py

Lines changed: 0 additions & 1 deletion
@@ -31,7 +31,6 @@ async def validate_output(state: State, config: RunnableConfig) -> dict[str, Any
     last_msg = re.sub(r"<think>.*?</think>", "", str(state.messages[-1].content), flags=re.DOTALL)
     validation_steps: list[StepOutput] = []
     recall_messages: list[HumanMessage] = []
-
     validation_outputs = validate_sparql_in_msg(last_msg, endpoints_metadata.prefixes_map, endpoints_metadata.void_dict)
     for validation_output in validation_outputs:
         if validation_output["fixed_query"]:

src/sparql_llm/config.py

Lines changed: 4 additions & 2 deletions
@@ -131,8 +131,10 @@ class Settings(BaseSettings):
     # vectordb_url: str = "http://vectordb:6334/"
     vectordb_url: str = "data/vectordb"
     # https://qdrant.github.io/fastembed/examples/Supported_Models/#supported-text-embedding-models
-    embedding_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
-    embedding_dimensions: int = 768
+    # embedding_model: str = "BAAI/bge-small-en-v1.5"
+    # embedding_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+    embedding_model: str = "intfloat/multilingual-e5-large"
+
     force_index: bool = False
     # Automatically initialize the vector store client, should be False when deploying in prod with multiple workers
     auto_init: bool = True
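
Because the dense vector size is now read from the embedding model itself (see the embedding_model.embedding_size usages in the files below), the separate embedding_dimensions: int = 768 setting could be dropped. A minimal standalone sketch of the new pattern, assuming the installed fastembed release supports intfloat/multilingual-e5-large and exposes the embedding_size property used in this commit:

from fastembed import TextEmbedding

# Illustrative snippet, not part of the repository.
# fastembed downloads the model weights on first use, so this needs network access.
embedding_model = TextEmbedding("intfloat/multilingual-e5-large")

# The vector size now comes from the model instead of a hardcoded setting
# (expected to be 1024 for multilingual-e5-large, versus 768 for the previous default).
print(embedding_model.embedding_size)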

src/sparql_llm/indexing/index_entities.py

Lines changed: 5 additions & 5 deletions
@@ -257,23 +257,23 @@ def generate_embeddings_for_entities(gpu: bool = False) -> None:
     if qdrant_client.collection_exists(settings.entities_collection_name):
         qdrant_client.delete_collection(settings.entities_collection_name)

+    # Process documents in batches to handle millions of entities efficiently
+    embedding_model = TextEmbedding(settings.embedding_model, providers=["CUDAExecutionProvider"] if gpu else None)
+    sparse_embedding_model = SparseTextEmbedding(settings.sparse_embedding_model)
+
     # Initialize collection in Qdrant vectordb with hybrid retrieval mode (dense and sparse vectors)
     # With indexes loaded on disk to avoid OOM errors when indexing large collections
     qdrant_client.create_collection(
         collection_name=settings.entities_collection_name,
         vectors_config=models.VectorParams(
-            size=settings.embedding_dimensions,
+            size=embedding_model.embedding_size,
             distance=models.Distance.COSINE,
             on_disk=True,
         ),
         hnsw_config=models.HnswConfigDiff(on_disk=True),
         sparse_vectors_config={"sparse": models.SparseVectorParams()},
     )

-    # Process documents in batches to handle millions of entities efficiently
-    embedding_model = TextEmbedding(settings.embedding_model, providers=["CUDAExecutionProvider"] if gpu else None)
-    sparse_embedding_model = SparseTextEmbedding(settings.sparse_embedding_model)
-
     batch_size = 1000  # Adjust based on your GPU memory and document size
     total_docs = len(docs)
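
The reordering above is what makes embedding_model.embedding_size available when the collection is created: the TextEmbedding must be instantiated before qdrant_client.create_collection. A self-contained sketch of that sizing pattern against a throwaway in-memory Qdrant instance (collection name and document text are made up for illustration; the sparse/hybrid setup from the real code is omitted):

from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models

# Instantiate the embedding model first so its size can drive the collection config
embedding_model = TextEmbedding("intfloat/multilingual-e5-large")

client = QdrantClient(location=":memory:")  # in-memory instance, for illustration only
client.create_collection(
    collection_name="demo",  # hypothetical collection name
    vectors_config=models.VectorParams(
        size=embedding_model.embedding_size,  # replaces the removed embedding_dimensions setting
        distance=models.Distance.COSINE,
    ),
)

# Embed and upsert a single toy document
doc = "What are the rat orthologs of human TP53?"
vector = next(iter(embedding_model.embed([doc])))
client.upsert(
    collection_name="demo",
    points=[models.PointStruct(id=1, vector=vector.tolist(), payload={"text": doc})],
)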

src/sparql_llm/indexing/index_resources.py

Lines changed: 1 addition & 2 deletions
@@ -162,7 +162,6 @@ def load_expasy_resources_infos(file: str = "expasy_resources_metadata.csv") ->
 def init_vectordb() -> None:
     """Initialize the vectordb with example queries and ontology descriptions from the SPARQL endpoints."""
     docs: list[Document] = []
-    endpoints_metadata._ensure_loaded()

     # Gets documents from the SPARQL endpoints
     for endpoint in settings.endpoints:
@@ -227,7 +226,7 @@ def init_vectordb() -> None:
         qdrant_client.delete_collection(settings.docs_collection_name)
     qdrant_client.create_collection(
         collection_name=settings.docs_collection_name,
-        vectors_config=VectorParams(size=settings.embedding_dimensions, distance=Distance.COSINE),
+        vectors_config=VectorParams(size=embedding_model.embedding_size, distance=Distance.COSINE),
     )

     # Generate embeddings with the fastembed `TextEmbedding` instance and upload directly to Qdrant

src/sparql_llm/mcp_server.py

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 from sparql_llm.utils import endpoints_metadata, logger, query_sparql
 from sparql_llm.validate_sparql import validate_sparql

+logger
 # What are the rat orthologs of the human TP53?
 # TODO: MCP integrated https://github.com/modelcontextprotocol/python-sdk/pull/1007

src/sparql_llm/utils.py

Lines changed: 1 addition & 0 deletions
@@ -201,6 +201,7 @@ def __init__(self, endpoints: list[SparqlEndpointLinks], auto_init: bool = True)
         self._void_dict: EndpointsSchemaDict = {}
         self._initialized = False
         if auto_init:
+            logger.info("Auto-initializing endpoints metadata...")
             self._ensure_loaded()

     def _ensure_loaded(self) -> None:
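
Together with the endpoints_metadata._ensure_loaded() call removed from src/sparql_llm/indexing/index_resources.py above, this reflects the class's lazy-initialization pattern: when auto_init is true the metadata loads itself in __init__, so callers no longer trigger loading explicitly. A stripped-down sketch of that pattern (class body and loading logic are simplified stand-ins for the real utils.py):

import logging

logger = logging.getLogger(__name__)


class EndpointsMetadata:
    """Simplified stand-in for the metadata holder in src/sparql_llm/utils.py."""

    def __init__(self, endpoints: list[str], auto_init: bool = True) -> None:
        self.endpoints = endpoints
        self._void_dict: dict = {}
        self._initialized = False
        if auto_init:
            logger.info("Auto-initializing endpoints metadata...")
            self._ensure_loaded()

    def _ensure_loaded(self) -> None:
        if self._initialized:
            return
        # The real class retrieves VoID descriptions from each SPARQL endpoint here
        self._void_dict = {endpoint: {} for endpoint in self.endpoints}
        self._initialized = True


# With auto_init=True (the default), callers such as init_vectordb() no longer
# call _ensure_loaded() themselves; auto_init=False defers loading, e.g. when
# deploying with multiple workers.
metadata = EndpointsMetadata(["https://sparql.uniprot.org/sparql"])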

tests/benchmark_biodata.py

Lines changed: 1 addition & 1 deletion
@@ -305,7 +305,7 @@ def main() -> None:
         qdrant_client.delete_collection(vector_collection)
     qdrant_client.create_collection(
         collection_name=vector_collection,
-        vectors_config=VectorParams(size=settings.embedding_dimensions, distance=Distance.COSINE),
+        vectors_config=VectorParams(size=embedding_model.embedding_size, distance=Distance.COSINE),
     )

     # Generate embeddings and add documents to vectordb

tutorial/app.py

Lines changed: 4 additions & 5 deletions
@@ -97,15 +97,14 @@ def load_chat_model(model: str) -> BaseChatModel:
     "BAAI/bge-small-en-v1.5",
     # providers=["CUDAExecutionProvider"], # Replace the fastembed dependency with fastembed-gpu to use your GPUs
 )
-embedding_dimensions = 384

 collection_name = "sparql-docs"
 vectordb = QdrantClient(path="data/vectordb")
 # vectordb = QdrantClient(location=":memory:")
 # vectordb = QdrantClient(host="localhost", prefer_grpc=True)


-def index_endpoints():
+def index_endpoints() -> None:
     """Index SPARQL endpoints metadata in the vector database."""
     docs: list[Document] = []
     for endpoint in endpoints:
@@ -124,7 +123,7 @@ def index_endpoints():
         vectordb.delete_collection(collection_name)
     vectordb.create_collection(
         collection_name=collection_name,
-        vectors_config=VectorParams(size=embedding_dimensions, distance=Distance.COSINE),
+        vectors_config=VectorParams(size=embedding_model.embedding_size, distance=Distance.COSINE),
     )

     embeddings = embedding_model.embed([q.page_content for q in docs])
@@ -223,7 +222,7 @@ def execute_query(last_msg: str) -> list[dict[str, str]]:


 @cl.on_message
-async def on_message(msg: cl.Message):
+async def on_message(msg: cl.Message) -> None:
     """Main function to handle when user send a message to the assistant."""
     retrieved_docs = retrieve_docs(msg.content)
     formatted_docs = "\n".join(format_doc(doc) for doc in retrieved_docs)
@@ -292,7 +291,7 @@ async def set_starters():
 # uv run --env-file .env app.py


-async def main():
+async def main() -> None:
     question = "What are the rat orthologs of human TP53?"

     logging.info("\n\n###### 🙉 Without context retrieval ########\n\n")

tutorial/index.py

Lines changed: 1 addition & 2 deletions
@@ -26,7 +26,6 @@
     "BAAI/bge-small-en-v1.5",
     # providers=["CUDAExecutionProvider"], # Replace the fastembed dependency with fastembed-gpu to use your GPUs
 )
-embedding_dimensions = 384

 vectordb = QdrantClient(host="localhost", prefer_grpc=True)
 collection_name = "sparql-docs"
@@ -53,7 +52,7 @@ def index_endpoints() -> None:
         vectordb.delete_collection(collection_name)
     vectordb.create_collection(
         collection_name=collection_name,
-        vectors_config=VectorParams(size=embedding_dimensions, distance=Distance.COSINE),
+        vectors_config=VectorParams(size=embedding_model.embedding_size, distance=Distance.COSINE),
     )

     embeddings = embedding_model.embed([q.page_content for q in docs])
