Commit 6e4c6fb
Ragas Evaluation For Multi Modes (#806)
* Updated models for ragas eval
* context utilization metrics removed
* updated supported llms for ragas
* removed context utilization
* Implemented Parallel API
* multi api calls error resolved
* MultiMode Metrics
* Fix: Metric Evaluation For Single Mode
* multi modes ragas evaluation
* api payload changes
* metric api output format changed
* multi mode ragas changes
* removed pre process dataset
* api response changes
* Multimode metrics api integration
* nan error for no answer resolved
* QA integration changes

---------

Co-authored-by: kaustubh-darekar <[email protected]>
1 parent bc8703d commit 6e4c6fb

10 files changed: +377 -186 lines

backend/score.py

Lines changed: 26 additions & 16 deletions
@@ -791,24 +791,34 @@ async def retry_processing(uri=Form(), userName=Form(), password=Form(), databas
         gc.collect()
 
 @app.post('/metric')
-async def calculate_metric(question=Form(), context=Form(), answer=Form(), model=Form()):
+async def calculate_metric(question: str = Form(),
+                           context: str = Form(),
+                           answer: str = Form(),
+                           model: str = Form(),
+                           mode: str = Form()):
     try:
-        payload_json_obj = {'api_name':'metric', 'context':context, 'answer':answer, 'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc))}
-        logger.log_struct(payload_json_obj, "INFO")
-        result = await asyncio.to_thread(get_ragas_metrics, question, context, answer, model)
-        if result is None or "error" in result:
-            return create_api_response(
-                'Failed',
-                message='Failed to calculate evaluation metrics.',
-                error=result.get("error", "Ragas evaluation returned null")
-            )
-        return create_api_response('Success', data=result)
+        context_list = [str(item).strip() for item in json.loads(context)] if context else []
+        answer_list = [str(item).strip() for item in json.loads(answer)] if answer else []
+        mode_list = [str(item).strip() for item in json.loads(mode)] if mode else []
+
+        result = await asyncio.to_thread(
+            get_ragas_metrics, question, context_list, answer_list, model
+        )
+        if result is None or "error" in result:
+            return create_api_response(
+                'Failed',
+                message='Failed to calculate evaluation metrics.',
+                error=result.get("error", "Ragas evaluation returned null")
+            )
+        data = {mode: {metric: result[metric][i] for metric in result} for i, mode in enumerate(mode_list)}
+        return create_api_response('Success', data=data)
     except Exception as e:
-        job_status = "Failed"
-        message = "Error while calculating evaluation metrics"
-        error_message = str(e)
-        logging.exception(f'{error_message}')
-        return create_api_response(job_status, message=message, error=error_message)
+        logging.exception(f"Error while calculating evaluation metrics: {e}")
+        return create_api_response(
+            'Failed',
+            message="Error while calculating evaluation metrics",
+            error=str(e)
+        )
     finally:
         gc.collect()
 

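With this change the /metric endpoint expects context, answer, and mode as JSON-encoded lists of equal length (one entry per chat mode), while question and model stay single values, and the response groups the Ragas scores per mode. A minimal client sketch, assuming a local deployment and the requests package; the base URL, mode names, and sample texts below are illustrative, not part of this commit:

import json
import requests  # assumption: the requests package is available on the client side

BASE_URL = "http://localhost:8000"  # hypothetical; point this at the deployed backend

# One context/answer entry per chat mode, JSON-encoded because the endpoint
# runs json.loads() on these form fields before evaluating.
payload = {
    "question": "Who founded Neo4j?",
    "context": json.dumps([
        "Vector-search context retrieved for the question ...",
        "Graph-derived context retrieved for the same question ...",
    ]),
    "answer": json.dumps([
        "Neo4j was founded by Emil Eifrem.",
        "Emil Eifrem founded Neo4j.",
    ]),
    "mode": json.dumps(["vector", "graph_vector"]),  # illustrative mode names
    "model": "openai_gpt_4o",
}

# The route reads Form() fields, so send form data rather than a JSON body.
resp = requests.post(f"{BASE_URL}/metric", data=payload)
print(resp.json())
# On success, "data" maps each mode to its metrics, e.g.
# {"vector": {"faithfulness": 0.91, "answer_relevancy": 0.97}, "graph_vector": {...}}
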
backend/src/ragas_eval.py

Lines changed: 17 additions & 61 deletions
@@ -1,96 +1,52 @@
 import os
 import logging
 import time
-from typing import Dict, Tuple, Optional
-import boto3
+from src.llm import get_llm
 from datasets import Dataset
 from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-from langchain_aws import ChatBedrock
-from langchain_community.chat_models import ChatOllama
-from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
-from langchain_fireworks import ChatFireworks
-from langchain_google_vertexai import (
-    ChatVertexAI,
-    HarmBlockThreshold,
-    HarmCategory,
-)
-from langchain_groq import ChatGroq
-from langchain_openai import AzureChatOpenAI, ChatOpenAI
 from ragas import evaluate
-from ragas.metrics import answer_relevancy, context_utilization, faithfulness
+from ragas.metrics import answer_relevancy, faithfulness
 from src.shared.common_fn import load_embedding_model
-
 load_dotenv()
 
-RAGAS_MODEL_VERSIONS = {
-    "openai_gpt_3.5": "gpt-3.5-turbo-16k",
-    "openai_gpt_4": "gpt-4-turbo-2024-04-09",
-    "openai_gpt_4o_mini": "gpt-4o-mini-2024-07-18",
-    "openai_gpt_4o": "gpt-4o-mini-2024-07-18",
-    "groq_llama3_70b": "groq_llama3_70b",
-}
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)
 
-
-def get_ragas_llm(model: str) -> Tuple[object, str]:
-    """Retrieves the specified language model. Improved error handling and structure."""
-    env_key = f"LLM_MODEL_CONFIG_{model}"
-    env_value = os.environ.get(env_key)
-    logging.info(f"Loading model configuration: {env_key}")
-    try:
-        if "openai" in model:
-            model_name = RAGAS_MODEL_VERSIONS[model]
-            llm = ChatOpenAI(
-                api_key=os.environ.get("OPENAI_API_KEY"), model=model_name, temperature=0
-            )
-        elif "groq" in model:
-            model_name, base_url, api_key = env_value.split(",")
-            llm = ChatGroq(api_key=api_key, model_name=model_name, temperature=0)
-        else:
-            raise ValueError(f"Unsupported model for evaluation: {model}")
-
-        logging.info(f"Model loaded - Model Version: {model}")
-        return llm, model_name
-    except (ValueError, KeyError) as e:
-        logging.error(f"Error loading LLM: {e}")
-        raise
-
-
-def get_ragas_metrics(
-    question: str, context: str, answer: str, model: str
-) -> Optional[Dict[str, float]]:
+def get_ragas_metrics(question: str, context: list, answer: list, model: str):
     """Calculates RAGAS metrics."""
     try:
         start_time = time.time()
         dataset = Dataset.from_dict(
-            {"question": [question], "answer": [answer], "contexts": [[context]]}
+            {"question": [question] * len(answer), "answer": answer, "contexts": [[ctx] for ctx in context]}
         )
-        logging.info("Dataset created successfully.")
-
-        llm, model_name = get_ragas_llm(model=model)
+        logging.info("Evaluation dataset created successfully.")
+        if ("diffbot" in model) or ("ollama" in model):
+            raise ValueError(f"Unsupported model for evaluation: {model}")
+        else:
+            llm, model_name = get_llm(model=model)
+
         logging.info(f"Evaluating with model: {model_name}")
-
+
         score = evaluate(
             dataset=dataset,
-            metrics=[faithfulness, answer_relevancy, context_utilization],
+            metrics=[faithfulness, answer_relevancy],
             llm=llm,
             embeddings=EMBEDDING_FUNCTION,
         )
-
+
         score_dict = (
-            score.to_pandas()[["faithfulness", "answer_relevancy", "context_utilization"]]
+            score.to_pandas()[["faithfulness", "answer_relevancy"]]
+            .fillna(0)
             .round(4)
-            .to_dict(orient="records")[0]
+            .to_dict(orient="list")
         )
         end_time = time.time()
         logging.info(f"Evaluation completed in: {end_time - start_time:.2f} seconds")
         return score_dict
     except ValueError as e:
         if "Unsupported model for evaluation" in str(e):
             logging.error(f"Unsupported model error: {e}")
-            return {"error": str(e)}  # Return the specific error message as a dictionary
+            return {"error": str(e)}
         logging.exception(f"ValueError during metrics evaluation: {e}")
         return {"error": str(e)}
     except Exception as e:

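Because the score is now exported with .to_dict(orient="list") and .fillna(0), get_ragas_metrics returns one list per metric with a score for every answer (NaN scores, e.g. when no answer was produced, become 0), and the caller indexes those lists by position to rebuild per-mode results. A small sketch with made-up scores, mirroring the comprehension used in the /metric route above:

# Hypothetical output of get_ragas_metrics for two answers to the same question.
result = {
    "faithfulness": [0.9167, 0.8333],      # one score per answer, in input order
    "answer_relevancy": [0.9812, 0.9456],
}
mode_list = ["vector", "graph_vector"]     # illustrative mode names sent by the client

# Same reshaping as the endpoint: pick the i-th score of every metric for the i-th mode.
data = {mode: {metric: result[metric][i] for metric in result} for i, mode in enumerate(mode_list)}
print(data)
# {'vector': {'faithfulness': 0.9167, 'answer_relevancy': 0.9812},
#  'graph_vector': {'faithfulness': 0.8333, 'answer_relevancy': 0.9456}}
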
0 commit comments
