
Commit 962f826

improve best match script
1 parent b7144dc commit 962f826

5 files changed: +500 -89 lines


data/lab/matches.csv

Lines changed: 9 additions & 9 deletions
@@ -1,9 +1,9 @@
-raw_event_input,concept_id,concept_name
-Haemoglobin levels in blood,3005872,Hemoglobin [Presence] in Blood
-creatinine levels in blood,3051825,Creatinine [Mass/volume] in Blood
-o2 sat test,3016502,Oxygen saturation in Arterial blood
-ph blood,3010421,pH of Blood
-potassium levels in blood,21490733,Potassium [Mass/volume] in Blood
-na levels in blood,3000285,Sodium [Moles/volume] in Blood
-blood co2,3013290,Carbon dioxide [Partial pressure] in Blood
-wbc count,3002317,Cells Counted Total [#] in Blood
+raw_event_input,concept_id,concept_name,score
+Haemoglobin levels in blood,3005872,Hemoglobin [Presence] in Blood,0.866
+creatinine levels in blood,46235076,"Creatinine [Moles/volume] in Serum, Plasma or Blood",0.8512
+o2 sat test,3016502,Oxygen saturation in Arterial blood,0.6807
+ph blood,3010421,pH of Blood,0.9425
+potassium levels in blood,46235078,"Potassium [Moles/volume] in Serum, Plasma or Blood",0.8501
+na levels in blood,42528959,Sodium [Mass/volume] in Red Blood Cells,0.8452
+blood co2,3027946,Carbon dioxide [Partial pressure] in Arterial blood,0.8461
+wbc count,3003282,Leukocytes [#/volume] in Blood by Manual count,0.7607
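
The new score column carries the vector-search similarity into the exported CSV, so downstream steps can triage low-confidence matches. A minimal sketch of consuming it, assuming pandas and a purely illustrative 0.8 review threshold (not part of the commit):

import pandas as pd

# Load the exported matches; the 0.8 cutoff below is illustrative only.
matches = pd.read_csv("data/lab/matches.csv")
needs_review = matches[matches["score"] < 0.8]
print(f"{len(needs_review)} of {len(matches)} matches fall below the threshold:")
print(needs_review[["raw_event_input", "concept_name", "score"]])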

src/omop_rag/best_match.py

Lines changed: 129 additions & 80 deletions
@@ -1,110 +1,158 @@
-import pandas as pd
 import json
-from transformers import pipeline
+import re
 
+import pandas as pd
+import requests
 
-def find_best_match(
-    qa_pipeline,
-    question,
-    concepts_list
-):
-    """
-    Finds the best matching concept from a list using a question-answer model.
+
+def find_best_match(input_term: str, concepts_list: list) -> dict | None:
+    """Find the best matching concept using a request to an Ollama model.
 
     Args:
-        qa_pipeline: The pre-initialised question-answering pipeline.
-        question (str): The question to ask (e.g., the raw event name).
-        concepts_list (list): A list of dictionaries, where each dict
-            represents a concept.
+        input_term (str): The free-text input term to be matched.
+        concepts_list (list): A list of candidate concept dictionaries,
+            each expected to have 'id', 'name', and 'score'.
 
     Returns:
-        dict: The best matching concept dictionary (id and name), or None
-            if no match is found.
+        A dictionary representing the best matching concept chosen by the
+        LLM, or the top vector search result as a fallback. Returns None
+        if an API error occurs and there are no concepts to fall back to.
     """
+    concepts_str = json.dumps(concepts_list, indent=4)
+
+    prompt = f"""
+    You are a highly skilled Clinical Terminologist and Medical Informatics
+    expert. Your task is to match the 'FREE TEXT INPUT' to the most precise
+    concept from the 'SIMILAR CONCEPTS' list. You must use the provided score
+    as a guide but prioritize clinical accuracy.
+
+    **RULES FOR SELECTION:**
+    1. Your answer MUST be one of the options from the 'SIMILAR CONCEPTS' list.
+    2. Prioritize clinical accuracy. 'levels' or 'test' usually implies a
+    quantitative measure like [Mass/volume] or [#/volume].
+    3. Choose the most general correct option unless the input specifies
+    otherwise (e.g., prefer 'Blood' over 'Arterial blood').
+
+    **FREE TEXT INPUT:**
+    {input_term}
+
+    **SIMILAR CONCEPTS:**
+    {concepts_str}
+
+    **TASK:**
+    Return a single JSON object for the best matching concept. This object
+    must include the id, name, and the original score. Do not add any other
+    text or explanation.
+
+    **TARGET OUTPUT FORMAT:**
+    {{
+        "id": 1234567,
+        "name": "Concept Name from the list",
+        "score": 0.9876
+    }}
+    """
+
+    ollama_api_url = "http://localhost:11434/api/generate"
+    payload = {
+        "model": "qwen3:1.7b",
+        "prompt": prompt,
+        "stream": False,
+        "options": {"temperature": 0.0},
+    }
 
-    context = ". ".join([c['name'] for c in concepts_list]) + "."
-
-    # The question-answering model will find the most likely answer span
-    # in the context. We formulate the question to find the concept name.
-    result = qa_pipeline(
-        question=f"""
-        You are a clinical and lab test specialist. Here is a set
-        of 10 closely matched lab tests to this lab test: '{question}'.
-        Select the single closest match?""",
-        context=context
-    )
-
-    # Find the concept that contains the predicted answer string
-    for concept in concepts_list:
-        if result['answer'].strip().lower() in concept['name'].strip().lower():
-            return concept
+    try:
+        response = requests.post(ollama_api_url, json=payload, timeout=60)
+        response.raise_for_status()
+        response_data = response.json()
+        model_output_str = response_data.get("response", "{}").strip()
+
+        # Extract the JSON object from the model's potentially noisy output.
+        match = re.search(r"\{.*\}", model_output_str, re.DOTALL)
+        if match:
+            clean_json_str = match.group(0)
+            best_match = json.loads(clean_json_str)
+            return best_match
+
+        raise json.JSONDecodeError(
+            "No JSON object found in model output.", model_output_str, 0
+        )
 
-    # If a direct match isn't found, fall back to the highest-scoring concept
-    # from the vector search
-    print("""
-        Warning: QA model's answer did not directly match a concept.
-        Falling back to top vector search result.
-    """)
-    return concepts_list[0]
+    except requests.exceptions.RequestException as e:
+        print(f"Error communicating with Ollama API: {e}")
+        return None
+    except json.JSONDecodeError:
+        print(
+            f"Warning: Failed to decode JSON for input '{input_term}'."
+            f"\nModel output was: {model_output_str}"
+            "\nFalling back to top vector search result."
+        )
+        return concepts_list[0] if concepts_list else None
 
 
 def process_json_and_export_csv(
-    input_json_path,
-    output_csv_path,
-    limit=5
+    input_json_path: str, output_csv_path: str, limit: int | None = None
 ):
-    """
-    Processes the JSON output from the vector search, uses a QA model to find
-    the best match for each input event, and exports the results to a CSV file.
+    """Process vector search results and export LLM-validated matches.
 
     Args:
-        input_json_path (str): The path to the input JSON file.
-        output_csv_path (str): The path to the output CSV file.
-        limit (int): The maximum number of rows to process.
+        input_json_path (str): Path to the input JSON file containing
+            search terms and their similar concepts.
+        output_csv_path (str): Path where the output CSV file will be
+            saved.
+        limit (int | None, optional): The maximum number of items to
+            process from the input file. Defaults to None (no limit).
     """
     try:
-        with open(input_json_path, 'r') as f:
+        with open(input_json_path, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
     except FileNotFoundError:
-        print(f"Error: The input JSON file '{input_json_path}' was not found.")
+        print(f"Error: Input file not found at '{input_json_path}'.")
+        return
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON from '{input_json_path}': {e}")
         return
 
-    # Initialise the question-answering pipeline once
-    print("Loading the deepset/roberta-base-squad2 model...")
-    qa_pipeline = pipeline(
-        "question-answering",
-        model="deepset/roberta-base-squad2",
-        tokenizer="deepset/roberta-base-squad2"
-    )
-    print("Model loaded.")
+    print("Starting processing with Ollama model...")
 
     results_for_csv = []
+    items_to_process = data[:limit] if limit is not None else data
 
-    # Use slicing to limit the processing to the first 'limit' items
-    for item in data[:limit]:
-        raw_event_input = item['input']
-        similar_concepts = item['similar_concepts']
+    for i, item in enumerate(items_to_process, 1):
+        raw_event_input = item["input"]
+        similar_concepts = item["similar_concepts"]
+
+        print(
+            f"Processing item {i}/{len(items_to_process)}: "
+            f"'{raw_event_input}'..."
+        )
 
         if not similar_concepts:
             print(f"No concepts found for '{raw_event_input}'. Skipping.")
             continue
 
-        # Use the QA model to find the best match from the list of concepts
-        best_match = find_best_match(
-            qa_pipeline,
-            raw_event_input,
-            similar_concepts
-        )
-
-        if best_match:
-            results_for_csv.append({
-                'raw_event_input': raw_event_input,
-                'concept_id': best_match['id'],
-                'concept_name': best_match['name']
-            })
-            print(f"Processed '{raw_event_input}': Best match is ID {best_match['id']} ('{best_match['name']}').")  # noqa: E501
+        best_match = find_best_match(raw_event_input, similar_concepts)
+
+        if best_match and all(k in best_match for k in [
+            "id",
+            "name",
+            "score"
+        ]):
+            results_for_csv.append(
+                {
+                    "raw_event_input": raw_event_input,
+                    "concept_id": best_match["id"],
+                    "concept_name": best_match["name"],
+                    "score": best_match["score"],
+                }
+            )
+            print(
+                f" -> Match: ID {best_match['id']} "
+                f"('{best_match['name']}') Score: {best_match['score']:.4f}"
+            )
+        else:
+            print(f" -> Could not determine a definitive match for "
+                  f"'{raw_event_input}'.")
 
-    # Export to CSV
     if results_for_csv:
         df = pd.DataFrame(results_for_csv)
         df.to_csv(output_csv_path, index=False)
@@ -113,8 +161,9 @@ def process_json_and_export_csv(
         print("\nNo results to save.")
 
 
-if __name__ == '__main__':
-    input_json_file = 'similar_results.json'
-    output_csv_file = 'matches.csv'
+if __name__ == "__main__":
+    # Define your input and output file paths
+    input_json_file = "similar_results.json"
+    output_csv_file = "matches.csv"
 
-    process_json_and_export_csv(input_json_file, output_csv_file, limit=10000)
+    process_json_and_export_csv(input_json_file, output_csv_file)
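
For a quick smoke test of the new flow, here is a minimal sketch (not part of the commit) that calls find_best_match directly. It assumes an Ollama server on localhost:11434 with qwen3:1.7b pulled and that the omop_rag package is importable; the candidate dicts mirror the shape the script reads from similar_results.json, with illustrative scores.

from omop_rag.best_match import find_best_match

# Candidates in the same id/name/score shape the script expects;
# ids and names come from the CSV above, the scores are illustrative.
candidates = [
    {"id": 3002317, "name": "Cells Counted Total [#] in Blood", "score": 0.77},
    {"id": 3003282, "name": "Leukocytes [#/volume] in Blood by Manual count",
     "score": 0.76},
]

best = find_best_match("wbc count", candidates)
if best:
    print(f"ID {best['id']}: {best['name']} (score {best['score']})")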

src/omop_rag/prompt.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+input_text = ""
+input_json = {}
+
+prompt = f"""
+You are a highly skilled Clinical Terminologist and Medical Informatics expert. Your task is to match a free-text lab test input to the most precise and representative concept from a list of similar candidates, prioritizing clinical accuracy over the provided score alone.
+
+**RULES FOR SELECTION:**
+1. **Direct Synonymy:** The concept must use the correct medical synonym (e.g., 'Leukocytes' for 'WBC').
+2. **Precision:** The concept must accurately reflect the measurement (e.g., 'count' maps to '[volume]').
+
+**FREE TEXT INPUT:**
+creatinine levels blood
+
+**SIMILAR CONCEPTS (including ID, Name, and raw Score):**
+
+{{
+    "id": 3051825,
+    "name": "Creatinine [Mass/volume] in Blood",
+    "score": 0.8939
+}},
+{{
+    "id": 40762887,
+    "name": "Creatinine [Moles/volume] in Blood",
+    "score": 0.8774
+}},
+{{
+    "id": 3007760,
+    "name": "Creatinine [Mass/volume] in Arterial blood",
+    "score": 0.8654
+}}
+
+**TASK:**
+1. State the **Closest Matched Concept** (ID and Name) ONLY, no explanation or formatting, just the JSON response.
+
+**TARGET OUTPUT FORMAT:**
+
+{{
+    'input_term': 'wbc count',
+    'id': 3010813,
+    'name': 'Leukocytes [volume] in Blood',
+}}
+
+"""
+
+print(prompt)
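
Note that every literal brace inside the f-string above has to be doubled ({{ and }}); a single { would be parsed as the start of a replacement field and break the string. A quick illustration:

concept_id = 3051825

# {{ and }} render as literal braces; {concept_id} is interpolated.
snippet = f'{{"id": {concept_id}}}'
print(snippet)  # prints {"id": 3051825}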
