Commit 6e4c6fb
Ragas Evaluation For Multi Modes (#806)
* Updated models for ragas eval
* context utilization metrics removed
* updated supported llms for ragas
* removed context utilization
* Implemented Parallel API
* multi api calls error resolved
* MultiMode Metrics
* Fix: Metric Evaluation For Single Mode
* multi modes ragas evaluation
* api payload changes
* metric api output format changed
* multi mode ragas changes
* removed pre process dataset
* api response changes
* Multimode metrics api integration
* nan error for no answer resolved
* QA integration changes

---------

Co-authored-by: kaustubh-darekar <[email protected]>
1 parent bc8703d commit 6e4c6fb

10 files changed: +377 -186 lines

backend/score.py

Lines changed: 26 additions & 16 deletions
@@ -791,24 +791,34 @@ async def retry_processing(uri=Form(), userName=Form(), password=Form(), databas
         gc.collect()
 
 @app.post('/metric')
-async def calculate_metric(question=Form(), context=Form(), answer=Form(), model=Form()):
+async def calculate_metric(question: str = Form(),
+                           context: str = Form(),
+                           answer: str = Form(),
+                           model: str = Form(),
+                           mode: str = Form()):
     try:
-        payload_json_obj = {'api_name':'metric', 'context':context, 'answer':answer, 'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc))}
-        logger.log_struct(payload_json_obj, "INFO")
-        result = await asyncio.to_thread(get_ragas_metrics, question, context, answer, model)
-        if result is None or "error" in result:
-            return create_api_response(
-                'Failed',
-                message='Failed to calculate evaluation metrics.',
-                error=result.get("error", "Ragas evaluation returned null")
-            )
-        return create_api_response('Success', data=result)
+        context_list = [str(item).strip() for item in json.loads(context)] if context else []
+        answer_list = [str(item).strip() for item in json.loads(answer)] if answer else []
+        mode_list = [str(item).strip() for item in json.loads(mode)] if mode else []
+
+        result = await asyncio.to_thread(
+            get_ragas_metrics, question, context_list, answer_list, model
+        )
+        if result is None or "error" in result:
+            return create_api_response(
+                'Failed',
+                message='Failed to calculate evaluation metrics.',
+                error=result.get("error", "Ragas evaluation returned null")
+            )
+        data = {mode: {metric: result[metric][i] for metric in result} for i, mode in enumerate(mode_list)}
+        return create_api_response('Success', data=data)
     except Exception as e:
-        job_status = "Failed"
-        message = "Error while calculating evaluation metrics"
-        error_message = str(e)
-        logging.exception(f'{error_message}')
-        return create_api_response(job_status, message=message, error=error_message)
+        logging.exception(f"Error while calculating evaluation metrics: {e}")
+        return create_api_response(
+            'Failed',
+            message="Error while calculating evaluation metrics",
+            error=str(e)
+        )
     finally:
         gc.collect()
 

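With this change the /metric endpoint expects context, answer, and mode as JSON-encoded lists of equal length (one entry per chat mode), while question and model stay single values, and the response groups the Ragas scores per mode. A minimal client sketch, assuming a local deployment and the requests package; the base URL, mode names, and sample texts below are illustrative, not part of this commit:

import json
import requests  # assumption: the requests package is available on the client side

BASE_URL = "http://localhost:8000"  # hypothetical; point this at the deployed backend

# One context/answer entry per chat mode, JSON-encoded because the endpoint
# runs json.loads() on these form fields before evaluating.
payload = {
    "question": "Who founded Neo4j?",
    "context": json.dumps([
        "Vector-search context retrieved for the question ...",
        "Graph-derived context retrieved for the same question ...",
    ]),
    "answer": json.dumps([
        "Neo4j was founded by Emil Eifrem.",
        "Emil Eifrem founded Neo4j.",
    ]),
    "mode": json.dumps(["vector", "graph_vector"]),  # illustrative mode names
    "model": "openai_gpt_4o",
}

# The route reads Form() fields, so send form data rather than a JSON body.
resp = requests.post(f"{BASE_URL}/metric", data=payload)
print(resp.json())
# On success, "data" maps each mode to its metrics, e.g.
# {"vector": {"faithfulness": 0.91, "answer_relevancy": 0.97}, "graph_vector": {...}}
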
backend/src/ragas_eval.py

Lines changed: 17 additions & 61 deletions
@@ -1,96 +1,52 @@
 import os
 import logging
 import time
-from typing import Dict, Tuple, Optional
-import boto3
+from src.llm import get_llm
 from datasets import Dataset
 from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-from langchain_aws import ChatBedrock
-from langchain_community.chat_models import ChatOllama
-from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
-from langchain_fireworks import ChatFireworks
-from langchain_google_vertexai import (
-    ChatVertexAI,
-    HarmBlockThreshold,
-    HarmCategory,
-)
-from langchain_groq import ChatGroq
-from langchain_openai import AzureChatOpenAI, ChatOpenAI
 from ragas import evaluate
-from ragas.metrics import answer_relevancy, context_utilization, faithfulness
+from ragas.metrics import answer_relevancy, faithfulness
 from src.shared.common_fn import load_embedding_model
-
 load_dotenv()
 
-RAGAS_MODEL_VERSIONS = {
-    "openai_gpt_3.5": "gpt-3.5-turbo-16k",
-    "openai_gpt_4": "gpt-4-turbo-2024-04-09",
-    "openai_gpt_4o_mini": "gpt-4o-mini-2024-07-18",
-    "openai_gpt_4o": "gpt-4o-mini-2024-07-18",
-    "groq_llama3_70b": "groq_llama3_70b",
-}
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)
 
-
-def get_ragas_llm(model: str) -> Tuple[object, str]:
-    """Retrieves the specified language model. Improved error handling and structure."""
-    env_key = f"LLM_MODEL_CONFIG_{model}"
-    env_value = os.environ.get(env_key)
-    logging.info(f"Loading model configuration: {env_key}")
-    try:
-        if "openai" in model:
-            model_name = RAGAS_MODEL_VERSIONS[model]
-            llm = ChatOpenAI(
-                api_key=os.environ.get("OPENAI_API_KEY"), model=model_name, temperature=0
-            )
-        elif "groq" in model:
-            model_name, base_url, api_key = env_value.split(",")
-            llm = ChatGroq(api_key=api_key, model_name=model_name, temperature=0)
-        else:
-            raise ValueError(f"Unsupported model for evaluation: {model}")
-
-        logging.info(f"Model loaded - Model Version: {model}")
-        return llm, model_name
-    except (ValueError, KeyError) as e:
-        logging.error(f"Error loading LLM: {e}")
-        raise
-
-
-def get_ragas_metrics(
-    question: str, context: str, answer: str, model: str
-) -> Optional[Dict[str, float]]:
+def get_ragas_metrics(question: str, context: list, answer: list, model: str):
     """Calculates RAGAS metrics."""
     try:
         start_time = time.time()
         dataset = Dataset.from_dict(
-            {"question": [question], "answer": [answer], "contexts": [[context]]}
+            {"question": [question] * len(answer), "answer": answer, "contexts": [[ctx] for ctx in context]}
         )
-        logging.info("Dataset created successfully.")
-
-        llm, model_name = get_ragas_llm(model=model)
+        logging.info("Evaluation dataset created successfully.")
+        if ("diffbot" in model) or ("ollama" in model):
+            raise ValueError(f"Unsupported model for evaluation: {model}")
+        else:
+            llm, model_name = get_llm(model=model)
+
         logging.info(f"Evaluating with model: {model_name}")
-
+
         score = evaluate(
             dataset=dataset,
-            metrics=[faithfulness, answer_relevancy, context_utilization],
+            metrics=[faithfulness, answer_relevancy],
             llm=llm,
             embeddings=EMBEDDING_FUNCTION,
         )
-
+
         score_dict = (
-            score.to_pandas()[["faithfulness", "answer_relevancy", "context_utilization"]]
+            score.to_pandas()[["faithfulness", "answer_relevancy"]]
+            .fillna(0)
             .round(4)
-            .to_dict(orient="records")[0]
+            .to_dict(orient="list")
         )
         end_time = time.time()
         logging.info(f"Evaluation completed in: {end_time - start_time:.2f} seconds")
         return score_dict
     except ValueError as e:
         if "Unsupported model for evaluation" in str(e):
             logging.error(f"Unsupported model error: {e}")
-            return {"error": str(e)}  # Return the specific error message as a dictionary
+            return {"error": str(e)}
         logging.exception(f"ValueError during metrics evaluation: {e}")
         return {"error": str(e)}
     except Exception as e:

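Because the score is now exported with .to_dict(orient="list") and .fillna(0), get_ragas_metrics returns one list per metric with a score for every answer (NaN scores, e.g. when no answer was produced, become 0), and the caller indexes those lists by position to rebuild per-mode results. A small sketch with made-up scores, mirroring the comprehension used in the /metric route above:

# Hypothetical output of get_ragas_metrics for two answers to the same question.
result = {
    "faithfulness": [0.9167, 0.8333],      # one score per answer, in input order
    "answer_relevancy": [0.9812, 0.9456],
}
mode_list = ["vector", "graph_vector"]     # illustrative mode names sent by the client

# Same reshaping as the endpoint: pick the i-th score of every metric for the i-th mode.
data = {mode: {metric: result[metric][i] for metric in result} for i, mode in enumerate(mode_list)}
print(data)
# {'vector': {'faithfulness': 0.9167, 'answer_relevancy': 0.9812},
#  'graph_vector': {'faithfulness': 0.8333, 'answer_relevancy': 0.9456}}
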
0 commit comments
