-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluations.py
More file actions
144 lines (117 loc) · 5.67 KB
/
evaluations.py
File metadata and controls
144 lines (117 loc) · 5.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# evaluations.py
# This file contains functions for evaluating the RAG system's responses
# using cosine similarity against ground truth answers.
import google.generativeai as genai
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import os
import argparse # For command-line arguments
import config # Assuming config.py has GOOGLE_API_KEY and embedding model
class EvaluationRunner:
"""
A class for calculating cosine similarity scores between questions,
ground truth answers, and LLM-generated answers.
"""
def __init__(self):
"""
Initializes the evaluator with the embedding model.
"""
genai.configure(api_key=config.GOOGLE_API_KEY)
self.embedding_model_name = config.DEFAULT_GOOGLE_EMBEDDING_MODEL
def get_embedding(self, text: str, task_type: str = "SEMANTIC_SIMILARITY") -> list[float]:
"""
Generates an embedding for a given text string.
Uses SEMANTIC_SIMILARITY task type for general text comparison.
Args:
text: The input string.
task_type: The task type for the embedding model.
Returns:
A list of floats representing the embedding, or an empty list if an error occurs.
"""
if not text:
return []
try:
embedding_response = genai.embed_content(
model=self.embedding_model_name,
content=text,
task_type=task_type
)
return embedding_response['embedding']
except Exception as e:
print(f"Error generating embedding for text: '{text[:50]}...': {e}")
return []
def calculate_similarity(self, emb1: list[float], emb2: list[float]) -> float:
"""
Calculates the cosine similarity between two embedding vectors.
Args:
emb1: The first embedding vector.
emb2: The second embedding vector.
Returns:
The cosine similarity score (float between -1.0 and 1.0), or 0.0 if inputs are invalid.
"""
if not emb1 or not emb2:
return 0.0
# Reshape for sklearn's cosine_similarity
return cosine_similarity(np.array(emb1).reshape(1, -1), np.array(emb2).reshape(1, -1))[0][0]
def evaluate_from_json(self, json_file_path: str):
"""
Loads evaluation data from a JSON file, calculates and prints similarity scores.
Args:
json_file_path: Path to the JSON file containing evaluation data.
"""
if not os.path.exists(json_file_path):
print(f"Error: JSON file not found at '{json_file_path}'")
return
with open(json_file_path, 'r', encoding='utf-8') as f:
evaluation_data = json.load(f)
if not isinstance(evaluation_data, list):
print("Error: JSON file content should be a list of evaluation objects.")
return
print(f"\n--- Starting Evaluation from '{json_file_path}' ---")
# Removed q_gt_similarities list
q_llm_similarities = []
gt_llm_similarities = []
for i, entry in enumerate(evaluation_data):
question = entry.get("question", "")
ground_truth = entry.get("ground_truth_answer", "")
llm_answer = entry.get("llm_answer", "")
if not question or not ground_truth or not llm_answer:
print(f"Warning: Skipping entry {i+1} due to missing data.")
continue
print(f"\n--- Entry {i+1} ---")
print(f"Question: {question[:80]}...")
print(f"Ground Truth: {ground_truth[:80]}...")
print(f"LLM Answer: {llm_answer[:80]}...")
# Generate embeddings
q_emb = self.get_embedding(question, task_type="RETRIEVAL_QUERY") # Query task type for question
gt_emb = self.get_embedding(ground_truth, task_type="RETRIEVAL_DOCUMENT") # Document task type for answers
llm_emb = self.get_embedding(llm_answer, task_type="RETRIEVAL_DOCUMENT") # Document task type for answers
if not q_emb or not gt_emb or not llm_emb:
print(f" Skipping similarity calculations for entry {i+1} due to embedding errors.")
continue
# Removed q_gt_sim calculation
q_llm_sim = self.calculate_similarity(q_emb, llm_emb)
gt_llm_sim = self.calculate_similarity(gt_emb, llm_emb)
# Removed appending to q_gt_similarities
q_llm_similarities.append(q_llm_sim)
gt_llm_similarities.append(gt_llm_sim)
# Removed Q-GT Similarity print
print(f" Q-LLM Similarity (Question vs. LLM Answer): {q_llm_sim:.4f}")
print(f" GT-LLM Similarity (Ground Truth vs. LLM Answer): {gt_llm_sim:.4f}")
print("\n--- Evaluation Summary ---")
# Changed condition to check if any similarities were calculated
if q_llm_similarities and gt_llm_similarities:
# Removed Average Q-GT Similarity print
print(f"Average Q-LLM Similarity: {np.mean(q_llm_similarities):.4f}")
print(f"Average GT-LLM Similarity: {np.mean(gt_llm_similarities):.4f}")
else:
print("No valid entries were processed for evaluation.")
print("--- Evaluation Complete ---\n")
# Command-line execution
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Evaluate chatbot responses using cosine similarity from a JSON file.")
parser.add_argument("json_file", type=str, help="Path to the JSON file containing evaluation data.")
args = parser.parse_args()
evaluator = EvaluationRunner()
evaluator.evaluate_from_json(args.json_file)