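Fix embeddings output path: pass args.embeddings_file to vectorise_clinical_concepts (now takes a full output file path instead of an output directory)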

holtyad · holtyad · commit 65fbca53366e · 2025-12-22T09:41:03.000Z
diff --git a/main.py b/main.py
@@ -108,7 +108,7 @@ def main():
 
         embeddings = vectorise_clinical_concepts(
             args.concepts_file,
-            os.path.dirname(args.concepts_file),
+            args.embeddings_file,
             batch_size=32
         )
 
diff --git a/src/omop_rag/create_embeddings.py b/src/omop_rag/create_embeddings.py
@@ -4,15 +4,15 @@
 import os
 
 
-def vectorise_clinical_concepts(csv_file_path, output_dir, batch_size=32):
+def vectorise_clinical_concepts(csv_file_path, output_path, batch_size=32):
     """
     Vectorises a CSV of clinical concepts using the MedEmbed-Large-v1 model.
 
     Args:
         csv_file_path (str): The path to the input CSV file. The CSV must
                              have a column named 'concept_name' containing
                              the text to embed.
-        output_dir (str): The directory to save the output files.
+        output_path (str): The full path to save the embeddings .pt file.
         batch_size (int): The number of concepts to process in each batch.
     """
 
@@ -61,18 +61,19 @@ def vectorise_clinical_concepts(csv_file_path, output_dir, batch_size=32):
     final_embeddings = torch.cat(embeddings)
 
     # Create the output directory if it doesn't exist
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Save embeddings to a file
-    embeddings_file_path = os.path.join(
-        output_dir,
-        'concept_embeddings.pt'
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    # Save embeddings to the specified file path
+    torch.save(final_embeddings, output_path)
+    print(f"Embeddings saved to {output_path}")
+
+    # Save the original concepts CSV to the same directory
+    concepts_output_path = os.path.join(
+        output_dir if output_dir else '.',
+        'clinical_concepts.csv'
     )
-    torch.save(final_embeddings, embeddings_file_path)
-    print(f"Embeddings saved to {embeddings_file_path}")
-
-    # Save the original concepts CSV to the output directory as well
-    concepts_output_path = os.path.join(output_dir, 'clinical_concepts.csv')
     df.to_csv(concepts_output_path, index=False)
     print(f"Original concepts saved to {concepts_output_path}")
 

Original file line number	Diff line number	Diff line change
`@@ -108,7 +108,7 @@ def main():`
`108`	`108`
`109`	`109`	`embeddings = vectorise_clinical_concepts(`
`110`	`110`	`args.concepts_file,`
`111`		`- os.path.dirname(args.concepts_file),`
	`111`	`+ args.embeddings_file,`
`112`	`112`	`batch_size=32`
`113`	`113`	`)`
`114`	`114`