-import pandas as pd
 import json
-from transformers import pipeline
+import re
 
+import pandas as pd
+import requests
 
-def find_best_match(
-    qa_pipeline,
-    question,
-    concepts_list
-):
-    """
-    Finds the best matching concept from a list using a question-answer model.
+
+def find_best_match(input_term: str, concepts_list: list) -> dict | None:
+    """Find the best matching concept using a request to an Ollama model.
 
     Args:
-        qa_pipeline: The pre-initialised question-answering pipeline.
-        question (str): The question to ask (e.g., the raw event name).
-        concepts_list (list): A list of dictionaries, where each dict
-            represents a concept.
+        input_term (str): The free-text input term to be matched.
+        concepts_list (list): A list of candidate concept dictionaries,
+            each expected to have 'id', 'name', and 'score' keys.
 
     Returns:
-        dict: The best matching concept dictionary (id and name), or None
-            if no match is found.
+        The best matching concept dictionary chosen by the LLM, or the top
+        vector search result as a fallback if the LLM output cannot be
+        parsed. Returns None if the API request fails, or if there are no
+        concepts to fall back to.
     """
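+    # Serialise the candidate concepts to pretty-printed JSON so the model
+    # sees the id, name, and score fields it is asked to echo back.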
+    concepts_str = json.dumps(concepts_list, indent=4)
+
+    prompt = f"""
+    You are a highly skilled Clinical Terminologist and Medical Informatics
+    expert. Your task is to match the 'FREE TEXT INPUT' to the most precise
+    concept from the 'SIMILAR CONCEPTS' list. You must use the provided score
+    as a guide but prioritize clinical accuracy.
+
+    **RULES FOR SELECTION:**
+    1. Your answer MUST be one of the options from the 'SIMILAR CONCEPTS' list.
+    2. Prioritize clinical accuracy. 'levels' or 'test' usually implies a
+       quantitative measure like [Mass/volume] or [#/volume].
+    3. Choose the most general correct option unless the input specifies
+       otherwise (e.g., prefer 'Blood' over 'Arterial blood').
+
+    **FREE TEXT INPUT:**
+    {input_term}
+
+    **SIMILAR CONCEPTS:**
+    {concepts_str}
+
+    **TASK:**
+    Return a single JSON object for the best matching concept. This object
+    must include the id, name, and the original score. Do not add any other
+    text or explanation.
+
+    **TARGET OUTPUT FORMAT:**
+    {{
+        "id": 1234567,
+        "name": "Concept Name from the list",
+        "score": 0.9876
+    }}
+    """
+
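+    # Assumes a local Ollama server on its default port (11434); streaming is
+    # off and temperature 0.0 keeps the selection deterministic across runs.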
+    ollama_api_url = "http://localhost:11434/api/generate"
+    payload = {
+        "model": "qwen3:1.7b",
+        "prompt": prompt,
+        "stream": False,
+        "options": {"temperature": 0.0},
+    }
 
-    context = ". ".join([c['name'] for c in concepts_list]) + "."
-
-    # The question-answering model will find the most likely answer span
-    # in the context. We formulate the question to find the concept name.
-    result = qa_pipeline(
-        question=f"""
-        You are a clinical and lab test specialist. Here is a set
-        of 10 closely matched lab tests to this lab test: '{question}'.
-        Select the single closest match?""",
-        context=context
-    )
-
-    # Find the concept that contains the predicted answer string
-    for concept in concepts_list:
-        if result['answer'].strip().lower() in concept['name'].strip().lower():
-            return concept
+    try:
+        response = requests.post(ollama_api_url, json=payload, timeout=60)
+        response.raise_for_status()
+        response_data = response.json()
+        model_output_str = response_data.get("response", "{}").strip()
+
+        # Extract the JSON object from the model's potentially noisy output.
+        match = re.search(r"\{.*\}", model_output_str, re.DOTALL)
+        if match:
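+            # The parsed object is trusted as-is; it is not re-validated
+            # against concepts_list, so a hallucinated id would slip through.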
+            clean_json_str = match.group(0)
+            best_match = json.loads(clean_json_str)
+            return best_match
+
+        raise json.JSONDecodeError(
+            "No JSON object found in model output.", model_output_str, 0
+        )
 
-    # If a direct match isn't found, fall back to the highest-scoring concept
-    # from the vector search
-    print("""
-    Warning: QA model's answer did not directly match a concept.
-    Falling back to top vector search result.
-    """)
-    return concepts_list[0]
+    except requests.exceptions.RequestException as e:
+        print(f"Error communicating with Ollama API: {e}")
+        return None
+    except json.JSONDecodeError:
+        print(
+            f"Warning: Failed to decode JSON for input '{input_term}'."
+            f"\nModel output was: {model_output_str}"
+            "\nFalling back to top vector search result."
+        )
+        return concepts_list[0] if concepts_list else None
 
 
 def process_json_and_export_csv(
-    input_json_path,
-    output_csv_path,
-    limit=5
+    input_json_path: str, output_csv_path: str, limit: int | None = None
 ):
56- """
57- Processes the JSON output from the vector search, uses a QA model to find
58- the best match for each input event, and exports the results to a CSV file.
95+ """Process vector search results and export LLM-validated matches.
5996
6097 Args:
61- input_json_path (str): The path to the input JSON file.
62- output_csv_path (str): The path to the output CSV file.
63- limit (int): The maximum number of rows to process.
98+ input_json_path (str): Path to the input JSON file containing
99+ search terms and their similar concepts.
100+ output_csv_path (str): Path where the output CSV file will be
101+ saved.
102+ limit (int | None, optional): The maximum number of items to
103+ process from the input file. Defaults to None (no limit).
64104 """
     try:
-        with open(input_json_path, 'r') as f:
+        with open(input_json_path, "r", encoding="utf-8-sig") as f:
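+            # "utf-8-sig" transparently strips a UTF-8 BOM if one is present.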
             data = json.load(f)
     except FileNotFoundError:
-        print(f"Error: The input JSON file '{input_json_path}' was not found.")
+        print(f"Error: Input file not found at '{input_json_path}'.")
+        return
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON from '{input_json_path}': {e}")
         return
 
-    # Initialise the question-answering pipeline once
-    print("Loading the deepset/roberta-base-squad2 model...")
-    qa_pipeline = pipeline(
-        "question-answering",
-        model="deepset/roberta-base-squad2",
-        tokenizer="deepset/roberta-base-squad2"
-    )
-    print("Model loaded.")
+    print("Starting processing with Ollama model...")
 
     results_for_csv = []
+    items_to_process = data[:limit] if limit is not None else data
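+    # limit=None processes every item; an integer keeps only the first N.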
 
-    # Use slicing to limit the processing to the first 'limit' items
-    for item in data[:limit]:
-        raw_event_input = item['input']
-        similar_concepts = item['similar_concepts']
+    for i, item in enumerate(items_to_process, 1):
+        raw_event_input = item["input"]
+        similar_concepts = item["similar_concepts"]
+
+        print(
+            f"Processing item {i}/{len(items_to_process)}: "
+            f"'{raw_event_input}'..."
+        )
 
         if not similar_concepts:
             print(f"No concepts found for '{raw_event_input}'. Skipping.")
             continue
 
-        # Use the QA model to find the best match from the list of concepts
-        best_match = find_best_match(
-            qa_pipeline,
-            raw_event_input,
-            similar_concepts
-        )
-
-        if best_match:
-            results_for_csv.append({
-                'raw_event_input': raw_event_input,
-                'concept_id': best_match['id'],
-                'concept_name': best_match['name']
-            })
-            print(f"Processed '{raw_event_input}': Best match is ID {best_match['id']} ('{best_match['name']}').")  # noqa: E501
+        best_match = find_best_match(raw_event_input, similar_concepts)
+
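+        # Keep only results where the model returned the full expected shape.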
+        if best_match and all(
+            k in best_match for k in ("id", "name", "score")
+        ):
+            results_for_csv.append(
+                {
+                    "raw_event_input": raw_event_input,
+                    "concept_id": best_match["id"],
+                    "concept_name": best_match["name"],
+                    "score": best_match["score"],
+                }
+            )
+            print(
+                f" -> Match: ID {best_match['id']} "
+                f"('{best_match['name']}') Score: {best_match['score']:.4f}"
+            )
+        else:
+            print(
+                f" -> Could not determine a definitive match for "
+                f"'{raw_event_input}'."
+            )
 
-    # Export to CSV
     if results_for_csv:
         df = pd.DataFrame(results_for_csv)
         df.to_csv(output_csv_path, index=False)
@@ -113,8 +161,9 @@ def process_json_and_export_csv(
         print("\nNo results to save.")
 
 
-if __name__ == '__main__':
-    input_json_file = 'similar_results.json'
-    output_csv_file = 'matches.csv'
+if __name__ == "__main__":
+    # Define your input and output file paths
+    input_json_file = "similar_results.json"
+    output_csv_file = "matches.csv"
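+    # Expected input shape (inferred from the keys this script reads; not a
+    # documented schema):
+    # [{"input": "...",
+    #   "similar_concepts": [{"id": 123, "name": "...", "score": 0.98}]}]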
 
-    process_json_and_export_csv(input_json_file, output_csv_file, limit=10000)
+    process_json_and_export_csv(input_json_file, output_csv_file)