Skip to content

Commit 6fae77c

Browse files
committed
test: improve text2sparql indexing
1 parent 1a98512 commit 6fae77c

File tree

3 files changed

+22
-28
lines changed

3 files changed

+22
-28
lines changed

compose.text2sparql.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,5 +77,6 @@ services:
       - DBPEDIA_URL=http://virtuoso-dbpedia:8890/sparql
       - CORPORATE_URL=http://virtuoso-corporate:8890/sparql
       - VECTORDB_URL=http://vectordb:6334/
+      - BENCH_MODEL=openrouter/openai/gpt-oss-120b
     entrypoint: uv run
     command: uvicorn tests.text2sparql.api:app --host 0.0.0.0 --port 8765

tests/text2sparql/api.py

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -29,24 +29,20 @@ def get_dataset_id_from_iri(dataset_iri: str) -> str:
     "https://text2sparql.aksw.org/2025/corporate/": os.getenv("CORPORATE_URL", "http://virtuoso-corporate:8890/sparql"),
 }
 
-MODEL = "openrouter/openai/gpt-oss-120b"
-DOCKER_ENDPOINT_URL = "http://text2sparql-virtuoso:8890/sparql/"
-DOCKER_VECTORDB_URL = "http://vectordb:6334"
-ENDPOINT_URL = "http://localhost:8890/sparql/"
-
-# def normalize_docker_to_localhost(url: str) -> str:
-
+MODEL = os.getenv("BENCH_MODEL", "openrouter/openai/gpt-oss-120b")
 
 SCHEMAS = {}
 for dataset_iri in DATASETS_ENDPOINTS.keys():
-    with open(
-        os.path.join("/", "data", f"{get_dataset_id_from_iri(dataset_iri)}_schema.json"),
-        encoding="utf-8",
-    ) as f:
-        SCHEMAS[dataset_iri] = json.load(f)
-    # SCHEMAS[dataset][DOCKER_ENDPOINT_URL] = SCHEMAS[dataset].pop(ENDPOINT_URL)
-    # docker_url =
-    SCHEMAS[dataset_iri][DOCKER_ENDPOINT_URL] = SCHEMAS[dataset_iri].pop(ENDPOINT_URL)
+    try:
+        with open(
+            os.path.join("/", "data", f"{get_dataset_id_from_iri(dataset_iri)}_schema.json"),
+            encoding="utf-8",
+        ) as f:
+            SCHEMAS[dataset_iri] = json.load(f)
+    except FileNotFoundError:
+        print(
+            f"Schema file for dataset {dataset_iri} not found. Please run the indexing script to generate the schema files."
+        )
 
 RAG_PROMPT = """
@@ -103,10 +99,11 @@ def get_dataset_id_from_iri(dataset_iri: str) -> str:
 async def get_answer(question: str, dataset: str):
     if dataset not in DATASETS_ENDPOINTS:
         raise fastapi.HTTPException(404, "Unknown dataset ...")
+    endpoint_url = DATASETS_ENDPOINTS[dataset]
     # Retrieve relevant queries
     question_embeddings = next(iter(embedding_model.embed([question])))
     retrieved_queries = vectordb.query_points(
-        collection_name=f"text2sparql-{dataset.split('/')[-2]}",
+        collection_name=f"text2sparql-{get_dataset_id_from_iri(dataset)}",
         query=question_embeddings,
         limit=settings.default_number_of_retrieved_docs,
         query_filter=Filter(
@@ -121,7 +118,7 @@ async def get_answer(question: str, dataset: str):
 
     # Retrieve relevant classes
     retrieved_classes = vectordb.query_points(
-        collection_name=f"text2sparql-{dataset.split('/')[-2]}",
+        collection_name=f"text2sparql-{get_dataset_id_from_iri(dataset)}",
         query=question_embeddings,
         limit=settings.default_number_of_retrieved_docs,
         query_filter=Filter(
@@ -163,14 +160,13 @@ async def get_answer(question: str, dataset: str):
             chat_resp_md = response.model_dump()["content"]
             generated_sparqls = extract_sparql_queries(chat_resp_md)
             generated_sparql = generated_sparqls[-1]["query"].strip()
-            generated_sparql = generated_sparql.replace(ENDPOINT_URL, DOCKER_ENDPOINT_URL)
             # print(f"Generated SPARQL query: {generated_sparql}")
             # print(f"Response message: {resp_msg}")
         except Exception:
             resp_msg += "## No SPARQL query could be extracted from the model response. Please provide a valid SPARQL query based on the provided information and try again.\n"
         if generated_sparql != "":
             try:
-                res = query_sparql(generated_sparql, DOCKER_ENDPOINT_URL)
+                res = query_sparql(generated_sparql, endpoint_url)
                 if res.get("results", {}).get("bindings"):
                     # Successfully generated a query with results
                     if num_of_tries > 0:
@@ -183,7 +179,7 @@ async def get_answer(question: str, dataset: str):
 
             except Exception as e:
                 validation_output = validate_sparql(
-                    query=generated_sparql, endpoint_url=DOCKER_ENDPOINT_URL, endpoints_void_dict=SCHEMAS[dataset]
+                    query=generated_sparql, endpoint_url=endpoint_url, endpoints_void_dict=SCHEMAS[dataset]
                 )
                 if validation_output["errors"]:
                     error_str = "- " + "\n- ".join(validation_output["errors"])

tests/text2sparql/index.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,16 @@
 
 def init_vectordb(
     endpoint_url: str,
-    graph: str,
+    dataset_iri: str,
     limit_schema: dict[str, float],
     max_workers: int,
     force_recompute: bool,
-    schema_path: str,
 ) -> None:
     """Initialize the vectordb with example queries and schema information from the SPARQL endpoints"""
     docs: list[Document] = []
 
     # Index example queries
-    examples = ["Generated-CK"] if "corporate" in graph else ["QALD-9+", "LC-QuAD"]
+    examples = ["Generated-CK"] if "corporate" in dataset_iri else ["QALD-9+", "LC-QuAD"]
 
     queries = pd.read_csv(QUERIES_FILE)
     queries = queries[queries["dataset"].isin(examples)].reset_index(drop=True)
@@ -51,11 +50,10 @@ def init_vectordb(
     start_time = time.time()
     schema = EndpointSchema(
         endpoint_url=endpoint_url,
-        # graph=graph,
         limit_schema=limit_schema,
         max_workers=max_workers,
         force_recompute=force_recompute,
-        schema_path=schema_path,
+        schema_path=os.path.join("data", f"{get_dataset_id_from_iri(dataset_iri)}_schema.json"),
     ).get_schema()
 
     docs += schema.apply(
@@ -84,7 +82,7 @@ def init_vectordb(
 
     embeddings = list(embedding_model.embed([d.page_content for d in docs]))
 
-    collection_name = f"text2sparql-{graph.split('/')[-2]}"
+    collection_name = f"text2sparql-{get_dataset_id_from_iri(dataset_iri)}"
     # Ensure collection exists before upserting
     if not qdrant_client.collection_exists(collection_name):
         qdrant_client.create_collection(
@@ -119,13 +117,12 @@
     # Init vectordb for the specified dataset
     init_vectordb(
         endpoint_url=DATASETS_ENDPOINTS[dataset_iri],
-        graph=dataset_iri,
+        dataset_iri=dataset_iri,
         limit_schema={
             "top_classes_percentile": 0,
             "top_n_predicates": 20,
             "top_n_ranges": 1,
         },
         max_workers=4,
         force_recompute=True,
-        schema_path=os.path.join("data", f"{get_dataset_id_from_iri(dataset_iri)}_schema.json"),
     )

0 commit comments

Comments (0)