 import os
 import logging
 import time
-from typing import Dict, Tuple, Optional
-import boto3
+from src.llm import get_llm
 from datasets import Dataset
 from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-from langchain_aws import ChatBedrock
-from langchain_community.chat_models import ChatOllama
-from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
-from langchain_fireworks import ChatFireworks
-from langchain_google_vertexai import (
-    ChatVertexAI,
-    HarmBlockThreshold,
-    HarmCategory,
-)
-from langchain_groq import ChatGroq
-from langchain_openai import AzureChatOpenAI, ChatOpenAI
 from ragas import evaluate
-from ragas.metrics import answer_relevancy, context_utilization, faithfulness
+from ragas.metrics import answer_relevancy, faithfulness
 from src.shared.common_fn import load_embedding_model
-
 load_dotenv()
 
-RAGAS_MODEL_VERSIONS = {
-    "openai_gpt_3.5": "gpt-3.5-turbo-16k",
-    "openai_gpt_4": "gpt-4-turbo-2024-04-09",
-    "openai_gpt_4o_mini": "gpt-4o-mini-2024-07-18",
-    "openai_gpt_4o": "gpt-4o-mini-2024-07-18",
-    "groq_llama3_70b": "groq_llama3_70b",
-}
-EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
+EMBEDDING_MODEL = os.getenv("RAGAS_EMBEDDING_MODEL")
 EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)
 
-
-def get_ragas_llm(model: str) -> Tuple[object, str]:
-    """Retrieves the specified language model. Improved error handling and structure."""
-    env_key = f"LLM_MODEL_CONFIG_{model}"
-    env_value = os.environ.get(env_key)
-    logging.info(f"Loading model configuration: {env_key}")
-    try:
-        if "openai" in model:
-            model_name = RAGAS_MODEL_VERSIONS[model]
-            llm = ChatOpenAI(
-                api_key=os.environ.get("OPENAI_API_KEY"), model=model_name, temperature=0
-            )
-        elif "groq" in model:
-            model_name, base_url, api_key = env_value.split(",")
-            llm = ChatGroq(api_key=api_key, model_name=model_name, temperature=0)
-        else:
-            raise ValueError(f"Unsupported model for evaluation: {model}")
-
-        logging.info(f"Model loaded - Model Version: {model}")
-        return llm, model_name
-    except (ValueError, KeyError) as e:
-        logging.error(f"Error loading LLM: {e}")
-        raise
-
-
-def get_ragas_metrics(
-    question: str, context: str, answer: str, model: str
-) -> Optional[Dict[str, float]]:
+def get_ragas_metrics(question: str, context: list, answer: list, model: str):
     """Calculates RAGAS metrics."""
     try:
         start_time = time.time()
         dataset = Dataset.from_dict(
-            {"question": [question], "answer": [answer], "contexts": [[context]]}
+            {"question": [question] * len(answer), "answer": answer, "contexts": [[ctx] for ctx in context]}
         )
-        logging.info("Dataset created successfully.")
-
-        llm, model_name = get_ragas_llm(model=model)
+        logging.info("Evaluation dataset created successfully.")
+        if ("diffbot" in model) or ("ollama" in model):
+            raise ValueError(f"Unsupported model for evaluation: {model}")
+        else:
+            llm, model_name = get_llm(model=model)
+
         logging.info(f"Evaluating with model: {model_name}")
-
+
         score = evaluate(
             dataset=dataset,
-            metrics=[faithfulness, answer_relevancy, context_utilization],
+            metrics=[faithfulness, answer_relevancy],
             llm=llm,
             embeddings=EMBEDDING_FUNCTION,
         )
-
+
         score_dict = (
-            score.to_pandas()[["faithfulness", "answer_relevancy", "context_utilization"]]
+            score.to_pandas()[["faithfulness", "answer_relevancy"]]
+            .fillna(0)
             .round(4)
-            .to_dict(orient="records")[0]
+            .to_dict(orient="list")
         )
         end_time = time.time()
         logging.info(f"Evaluation completed in: {end_time - start_time:.2f} seconds")
         return score_dict
     except ValueError as e:
         if "Unsupported model for evaluation" in str(e):
             logging.error(f"Unsupported model error: {e}")
-            return {"error": str(e)}  # Return the specific error message as a dictionary
+            return {"error": str(e)}
         logging.exception(f"ValueError during metrics evaluation: {e}")
         return {"error": str(e)}
     except Exception as e:
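With this change, get_ragas_metrics evaluates a batch of answers and their retrieved contexts in one call and returns per-metric score lists (or an {"error": ...} dict when an unsupported model such as diffbot or ollama is requested). A minimal usage sketch, assuming the edited module is importable as ragas_eval and that the question, contexts, answers, and the "openai_gpt_4o" model key are purely illustrative:

# Hypothetical import path; the real module/package name may differ in the repo.
from ragas_eval import get_ragas_metrics

question = "What database does the project build a knowledge graph in?"
# One retrieved context string and one generated answer per response to score.
contexts = [
    "The project extracts entities and stores them in Neo4j.",
    "Documents are chunked and embedded before graph construction.",
]
answers = [
    "It stores the knowledge graph in Neo4j.",
    "It chunks and embeds documents, then builds the graph.",
]

# "openai_gpt_4o" is an example model key; diffbot/ollama models are rejected
# and reported back via the {"error": ...} return value instead of raising.
scores = get_ragas_metrics(question, contexts, answers, model="openai_gpt_4o")
print(scores)  # e.g. {"faithfulness": [0.9, 1.0], "answer_relevancy": [0.87, 0.91]}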