|
3 | 3 | import httpx |
4 | 4 | import pandas as pd |
5 | 5 | from bs4 import BeautifulSoup |
| 6 | +from fastembed import TextEmbedding |
6 | 7 | from langchain_core.documents import Document |
7 | 8 | from markdownify import markdownify |
8 | | -from qdrant_client import models |
| 9 | +from qdrant_client import QdrantClient, models |
9 | 10 | from qdrant_client.http.models import Distance, VectorParams |
10 | 11 | from rdflib import RDF, Dataset, Namespace |
11 | 12 |
|
12 | 13 | from sparql_llm import SparqlExamplesLoader, SparqlInfoLoader, SparqlVoidShapesLoader |
13 | | -from sparql_llm.config import SparqlEndpointLinks, embedding_model, qdrant_client, settings |
| 14 | +from sparql_llm.config import SparqlEndpointLinks, settings |
14 | 15 | from sparql_llm.loaders.sparql_info_loader import GENERAL_INFO_DOC_TYPE |
15 | | -from sparql_llm.utils import endpoints_metadata |
| 16 | +from sparql_llm.utils import EndpointsMetadataManager |
16 | 17 |
|
17 | 18 | SCHEMA = Namespace("http://schema.org/") |
18 | 19 |
|
19 | 20 |
|
| 21 | +# Shared module-level manager built from settings; metadata loads lazily on first property access
| 22 | +endpoints_metadata = EndpointsMetadataManager(settings.endpoints, settings.auto_init)
| 23 | + |
| 24 | +# TODO: `QdrantVectorStore.from_existing_collection(client=qdrant_client)` raises `TypeError: cannot pickle '_thread.RLock' object`
| 25 | +qdrant_client = (
| 26 | + QdrantClient(url=settings.vectordb_url, prefer_grpc=True, timeout=600)
| 27 | + if settings.vectordb_url.startswith(("http://", "https://"))  # explicit HTTP(S) scheme -> connect by URL; anything else is used as a local storage path
| 28 | + else QdrantClient(path=settings.vectordb_url)
| 29 | +)
| 30 | + |
| 31 | +embedding_model = TextEmbedding(
| 32 | + settings.embedding_model,
| 33 | + # providers=["CUDAExecutionProvider"], # To use your GPUs: enable this option and replace the fastembed dependency with fastembed-gpu
| 34 | +)
| 35 | + |
| 36 | + |
20 | 37 | def load_schemaorg_description(endpoint: SparqlEndpointLinks) -> list[Document]: |
21 | 38 | """Extract datasets descriptions from the schema.org metadata in homepage of the endpoint""" |
22 | 39 | docs = [] |
|
0 commit comments