
Commit 962f826

improve best match script
1 parent b7144dc commit 962f826

5 files changed: +500 -89 lines


data/lab/matches.csv

Lines changed: 9 additions & 9 deletions
@@ -1,9 +1,9 @@
-raw_event_input,concept_id,concept_name
-Haemoglobin levels in blood,3005872,Hemoglobin [Presence] in Blood
-creatinine levels in blood,3051825,Creatinine [Mass/volume] in Blood
-o2 sat test,3016502,Oxygen saturation in Arterial blood
-ph blood,3010421,pH of Blood
-potassium levels in blood,21490733,Potassium [Mass/volume] in Blood
-na levels in blood,3000285,Sodium [Moles/volume] in Blood
-blood co2,3013290,Carbon dioxide [Partial pressure] in Blood
-wbc count,3002317,Cells Counted Total [#] in Blood
+raw_event_input,concept_id,concept_name,score
+Haemoglobin levels in blood,3005872,Hemoglobin [Presence] in Blood,0.866
+creatinine levels in blood,46235076,"Creatinine [Moles/volume] in Serum, Plasma or Blood",0.8512
+o2 sat test,3016502,Oxygen saturation in Arterial blood,0.6807
+ph blood,3010421,pH of Blood,0.9425
+potassium levels in blood,46235078,"Potassium [Moles/volume] in Serum, Plasma or Blood",0.8501
+na levels in blood,42528959,Sodium [Mass/volume] in Red Blood Cells,0.8452
+blood co2,3027946,Carbon dioxide [Partial pressure] in Arterial blood,0.8461
+wbc count,3003282,Leukocytes [#/volume] in Blood by Manual count,0.7607
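
The new score column carries the vector-search similarity into the exported CSV, so downstream steps can triage low-confidence matches. A minimal sketch of consuming it, assuming pandas and a purely illustrative 0.8 review threshold (not part of the commit):

import pandas as pd

# Load the exported matches; the 0.8 cutoff below is illustrative only.
matches = pd.read_csv("data/lab/matches.csv")
needs_review = matches[matches["score"] < 0.8]
print(f"{len(needs_review)} of {len(matches)} matches fall below the threshold:")
print(needs_review[["raw_event_input", "concept_name", "score"]])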

src/omop_rag/best_match.py

Lines changed: 129 additions & 80 deletions
@@ -1,110 +1,158 @@
-import pandas as pd
 import json
-from transformers import pipeline
+import re
 
+import pandas as pd
+import requests
 
-def find_best_match(
-    qa_pipeline,
-    question,
-    concepts_list
-):
-    """
-    Finds the best matching concept from a list using a question-answer model.
+
+def find_best_match(input_term: str, concepts_list: list) -> dict | None:
+    """Find the best matching concept using a request to an Ollama model.
 
     Args:
-        qa_pipeline: The pre-initialised question-answering pipeline.
-        question (str): The question to ask (e.g., the raw event name).
-        concepts_list (list): A list of dictionaries, where each dict
-            represents a concept.
+        input_term (str): The free-text input term to be matched.
+        concepts_list (list): A list of candidate concept dictionaries,
+            each expected to have 'id', 'name', and 'score'.
 
     Returns:
-        dict: The best matching concept dictionary (id and name), or None
-            if no match is found.
+        A dictionary representing the best matching concept chosen by the
+        LLM, or the top vector search result as a fallback. Returns None
+        if an API error occurs and there are no concepts to fall back to.
     """
+    concepts_str = json.dumps(concepts_list, indent=4)
+
+    prompt = f"""
+    You are a highly skilled Clinical Terminologist and Medical Informatics
+    expert. Your task is to match the 'FREE TEXT INPUT' to the most precise
+    concept from the 'SIMILAR CONCEPTS' list. You must use the provided score
+    as a guide but prioritize clinical accuracy.
+
+    **RULES FOR SELECTION:**
+    1. Your answer MUST be one of the options from the 'SIMILAR CONCEPTS' list.
+    2. Prioritize clinical accuracy. 'levels' or 'test' usually implies a
+    quantitative measure like [Mass/volume] or [#/volume].
+    3. Choose the most general correct option unless the input specifies
+    otherwise (e.g., prefer 'Blood' over 'Arterial blood').
+
+    **FREE TEXT INPUT:**
+    {input_term}
+
+    **SIMILAR CONCEPTS:**
+    {concepts_str}
+
+    **TASK:**
+    Return a single JSON object for the best matching concept. This object
+    must include the id, name, and the original score. Do not add any other
+    text or explanation.
+
+    **TARGET OUTPUT FORMAT:**
+    {{
+        "id": 1234567,
+        "name": "Concept Name from the list",
+        "score": 0.9876
+    }}
+    """
+
+    ollama_api_url = "http://localhost:11434/api/generate"
+    payload = {
+        "model": "qwen3:1.7b",
+        "prompt": prompt,
+        "stream": False,
+        "options": {"temperature": 0.0},
+    }
 
-    context = ". ".join([c['name'] for c in concepts_list]) + "."
-
-    # The question-answering model will find the most likely answer span
-    # in the context. We formulate the question to find the concept name.
-    result = qa_pipeline(
-        question=f"""
-        You are a clinical and lab test specialist. Here is a set
-        of 10 closely matched lab tests to this lab test: '{question}'.
-        Select the single closest match?""",
-        context=context
-    )
-
-    # Find the concept that contains the predicted answer string
-    for concept in concepts_list:
-        if result['answer'].strip().lower() in concept['name'].strip().lower():
-            return concept
+    try:
+        response = requests.post(ollama_api_url, json=payload, timeout=60)
+        response.raise_for_status()
+        response_data = response.json()
+        model_output_str = response_data.get("response", "{}").strip()
+
+        # Extract the JSON object from the model's potentially noisy output.
+        match = re.search(r"\{.*\}", model_output_str, re.DOTALL)
+        if match:
+            clean_json_str = match.group(0)
+            best_match = json.loads(clean_json_str)
+            return best_match
+
+        raise json.JSONDecodeError(
+            "No JSON object found in model output.", model_output_str, 0
+        )
 
-    # If a direct match isn't found, fall back to the highest-scoring concept
-    # from the vector search
-    print("""
-        Warning: QA model's answer did not directly match a concept.
-        Falling back to top vector search result.
-    """)
-    return concepts_list[0]
+    except requests.exceptions.RequestException as e:
+        print(f"Error communicating with Ollama API: {e}")
+        return None
+    except json.JSONDecodeError:
+        print(
+            f"Warning: Failed to decode JSON for input '{input_term}'."
+            f"\nModel output was: {model_output_str}"
+            "\nFalling back to top vector search result."
+        )
+        return concepts_list[0] if concepts_list else None
 
 
 def process_json_and_export_csv(
-    input_json_path,
-    output_csv_path,
-    limit=5
+    input_json_path: str, output_csv_path: str, limit: int | None = None
 ):
-    """
-    Processes the JSON output from the vector search, uses a QA model to find
-    the best match for each input event, and exports the results to a CSV file.
+    """Process vector search results and export LLM-validated matches.
 
     Args:
-        input_json_path (str): The path to the input JSON file.
-        output_csv_path (str): The path to the output CSV file.
-        limit (int): The maximum number of rows to process.
+        input_json_path (str): Path to the input JSON file containing
+            search terms and their similar concepts.
+        output_csv_path (str): Path where the output CSV file will be
+            saved.
+        limit (int | None, optional): The maximum number of items to
+            process from the input file. Defaults to None (no limit).
     """
     try:
-        with open(input_json_path, 'r') as f:
+        with open(input_json_path, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
     except FileNotFoundError:
-        print(f"Error: The input JSON file '{input_json_path}' was not found.")
+        print(f"Error: Input file not found at '{input_json_path}'.")
+        return
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON from '{input_json_path}': {e}")
         return
 
-    # Initialise the question-answering pipeline once
-    print("Loading the deepset/roberta-base-squad2 model...")
-    qa_pipeline = pipeline(
-        "question-answering",
-        model="deepset/roberta-base-squad2",
-        tokenizer="deepset/roberta-base-squad2"
-    )
-    print("Model loaded.")
+    print("Starting processing with Ollama model...")
 
     results_for_csv = []
+    items_to_process = data[:limit] if limit is not None else data
 
-    # Use slicing to limit the processing to the first 'limit' items
-    for item in data[:limit]:
-        raw_event_input = item['input']
-        similar_concepts = item['similar_concepts']
+    for i, item in enumerate(items_to_process, 1):
+        raw_event_input = item["input"]
+        similar_concepts = item["similar_concepts"]
+
+        print(
+            f"Processing item {i}/{len(items_to_process)}: "
+            f"'{raw_event_input}'..."
+        )
 
         if not similar_concepts:
             print(f"No concepts found for '{raw_event_input}'. Skipping.")
             continue
 
-        # Use the QA model to find the best match from the list of concepts
-        best_match = find_best_match(
-            qa_pipeline,
-            raw_event_input,
-            similar_concepts
-        )
-
-        if best_match:
-            results_for_csv.append({
-                'raw_event_input': raw_event_input,
-                'concept_id': best_match['id'],
-                'concept_name': best_match['name']
-            })
-            print(f"Processed '{raw_event_input}': Best match is ID {best_match['id']} ('{best_match['name']}').")  # noqa: E501
+        best_match = find_best_match(raw_event_input, similar_concepts)
+
+        if best_match and all(k in best_match for k in [
+            "id",
+            "name",
+            "score"
+        ]):
+            results_for_csv.append(
+                {
+                    "raw_event_input": raw_event_input,
+                    "concept_id": best_match["id"],
+                    "concept_name": best_match["name"],
+                    "score": best_match["score"],
+                }
+            )
+            print(
+                f" -> Match: ID {best_match['id']} "
+                f"('{best_match['name']}') Score: {best_match['score']:.4f}"
+            )
+        else:
+            print(f" -> Could not determine a definitive match for "
+                  f"'{raw_event_input}'.")
 
-    # Export to CSV
     if results_for_csv:
         df = pd.DataFrame(results_for_csv)
         df.to_csv(output_csv_path, index=False)
@@ -113,8 +161,9 @@ def process_json_and_export_csv(
         print("\nNo results to save.")
 
 
-if __name__ == '__main__':
-    input_json_file = 'similar_results.json'
-    output_csv_file = 'matches.csv'
+if __name__ == "__main__":
+    # Define your input and output file paths
+    input_json_file = "similar_results.json"
+    output_csv_file = "matches.csv"
 
-    process_json_and_export_csv(input_json_file, output_csv_file, limit=10000)
+    process_json_and_export_csv(input_json_file, output_csv_file)
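
For a quick smoke test of the new flow, here is a minimal sketch (not part of the commit) that calls find_best_match directly. It assumes an Ollama server on localhost:11434 with qwen3:1.7b pulled and that the omop_rag package is importable; the candidate dicts mirror the shape the script reads from similar_results.json, with illustrative scores.

from omop_rag.best_match import find_best_match

# Candidates in the same id/name/score shape the script expects;
# ids and names come from the CSV above, the scores are illustrative.
candidates = [
    {"id": 3002317, "name": "Cells Counted Total [#] in Blood", "score": 0.77},
    {"id": 3003282, "name": "Leukocytes [#/volume] in Blood by Manual count",
     "score": 0.76},
]

best = find_best_match("wbc count", candidates)
if best:
    print(f"ID {best['id']}: {best['name']} (score {best['score']})")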

src/omop_rag/prompt.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+input_text = ""
+input_json = {}
+
+prompt = f"""
+You are a highly skilled Clinical Terminologist and Medical Informatics expert. Your task is to match a free-text lab test input to the most precise and representative concept from a list of similar candidates, prioritizing clinical accuracy over the provided score alone.
+
+**RULES FOR SELECTION:**
+1. **Direct Synonymy:** The concept must use the correct medical synonym (e.g., 'Leukocytes' for 'WBC').
+2. **Precision:** The concept must accurately reflect the measurement (e.g., 'count' maps to '[volume]').
+
+**FREE TEXT INPUT:**
+creatinine levels blood
+
+**SIMILAR CONCEPTS (including ID, Name, and raw Score):**
+
+{{
+    "id": 3051825,
+    "name": "Creatinine [Mass/volume] in Blood",
+    "score": 0.8939
+}},
+{{
+    "id": 40762887,
+    "name": "Creatinine [Moles/volume] in Blood",
+    "score": 0.8774
+}},
+{{
+    "id": 3007760,
+    "name": "Creatinine [Mass/volume] in Arterial blood",
+    "score": 0.8654
+}}
+
+**TASK:**
+1. State the **Closest Matched Concept** (ID and Name) ONLY, no explanation or formatting, just the JSON response.
+
+**TARGET OUTPUT FORMAT:**
+
+{{
+    'input_term': 'wbc count',
+    'id': 3010813,
+    'name': 'Leukocytes [volume] in Blood',
+}}
+
+"""
+
+print(prompt)
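
Note that every literal brace inside the f-string above has to be doubled ({{ and }}); a single { would be parsed as the start of a replacement field and break the string. A quick illustration:

concept_id = 3051825

# {{ and }} render as literal braces; {concept_id} is interpolated.
snippet = f'{{"id": {concept_id}}}'
print(snippet)  # prints {"id": 3051825}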
