
Commit 4fa115b

committed
feat: add eval and metric code to the pipeline, and fix a bug in dumping results
1 parent 94ec427 commit 4fa115b

File tree

8 files changed (+307 −241 lines)


evaluation/scripts/temporal_locomo/locomo_eval.py

Lines changed: 126 additions & 98 deletions
@@ -1,17 +1,15 @@
 import argparse
 import asyncio
 import json
-import logging
 import os
 import time

 import nltk
 import numpy as np
-import transformers

 from bert_score import score as bert_score
 from dotenv import load_dotenv
-from locomo_processor import BASE_DIR
+from modules.locomo_eval_module import LocomoEvalModelModules
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
 from nltk.translate.meteor_score import meteor_score
 from openai import AsyncOpenAI
@@ -21,9 +19,11 @@
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm

+from memos.log import get_logger
+
+
+logger = get_logger(__name__)

-logging.basicConfig(level=logging.CRITICAL)
-transformers.logging.set_verbosity_error()

 # Download necessary NLTK resources
 try:
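Side note on the hunk above: the blanket suppression via logging.basicConfig(level=logging.CRITICAL) and transformers.logging.set_verbosity_error() is replaced by a module-level logger from memos.log, which is used for error reporting further down (logger.error(..., exc_info=True)). A minimal stdlib-only sketch of the same pattern, assuming get_logger returns an ordinary logging.Logger:

import logging

# Stand-in for memos.log.get_logger(__name__); assumed to return a configured logging.Logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    raise ValueError("example failure")  # hypothetical error, for illustration only
except Exception as e:
    # exc_info=True attaches the full traceback to the log record,
    # which plain print(...) error reporting cannot do.
    logger.error(f"Error processing group: {e}", exc_info=True)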
@@ -209,7 +209,9 @@ def convert_numpy_types(obj):
     return obj


-async def process_group_responses(group_id, group_responses, oai_client, options, num_runs: int):
+async def process_group_responses(
+    group_id, group_responses, oai_client, evaluation_options, num_runs: int
+):
     graded_responses = []

     # Process responses with asyncio for concurrent API calls
@@ -232,7 +234,7 @@ async def process_group_responses(group_id, group_responses, oai_client, options
         judgments = await asyncio.gather(*grading_tasks)
         judgments_dict = {f"judgment_{i + 1}": j for i, j in enumerate(judgments)}

-        nlp_metrics = calculate_nlp_metrics(ground_truth, answer, context, options)
+        nlp_metrics = calculate_nlp_metrics(ground_truth, answer, context, evaluation_options)

         graded_response = {
             "question": question,
@@ -250,113 +252,126 @@ async def process_group_responses(group_id, group_responses, oai_client, options
     return group_id, graded_responses


-async def process_single_group(group_id, group_responses, oai_client, options, num_runs):
+async def process_single_group(group_id, group_responses, oai_client, evaluation_options, num_runs):
     try:
         start_time = time.time()
         result = await process_group_responses(
-            group_id, group_responses, oai_client, options, num_runs
+            group_id, group_responses, oai_client, evaluation_options, num_runs
         )
         end_time = time.time()
         elapsed_time = round(end_time - start_time, 2)
         print(f"Group {group_id} processed in {elapsed_time} seconds")
         return result
     except Exception as e:
-        print(f"Error processing group {group_id}: {e}")
+        logger.error(f"Error processing group {group_id}: {e}", exc_info=True)
         return group_id, []


-async def main(frame, version="default", options=None, num_runs=1, max_workers=4):
-    print(
-        f"\n=== Starting LoCoMo evaluation for {frame} (version: {version}) with {num_runs} run(s) per question ==="
-    )
-    print(f"Using {max_workers} concurrent workers for processing groups")
-
-    results_dir = f"{BASE_DIR}/results/locomo/{frame}-{version}"
-    response_path = f"{results_dir}/{frame}_locomo_responses.json"
-    judged_path = f"{results_dir}/{frame}_locomo_judged.json"
-
-    os.makedirs(results_dir, exist_ok=True)
-
-    load_dotenv()
-    oai_client = AsyncOpenAI(
-        api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL")
-    )
-
-    with open(response_path) as file:
-        locomo_responses = json.load(file)
-
-    num_users = 10
-    all_grades = {}
-
-    total_responses_count = sum(
-        len(locomo_responses.get(f"locomo_exp_user_{i}", [])) for i in range(num_users)
-    )
-    print(f"Found {total_responses_count} total responses across {num_users} users to evaluate")
-
-    # Create tasks for processing each group
-    tasks = []
-    active_users = 0
-    for group_idx in range(num_users):
-        group_id = f"locomo_exp_user_{group_idx}"
-        group_responses = locomo_responses.get(group_id, [])
-        if not group_responses:
-            print(f"No responses found for group {group_id}")
-            continue
+class LocomoEvaluator(LocomoEvalModelModules):
+    def __init__(self, args):
+        # Initialize base class to populate self.frame, self.version, etc.
+        super().__init__(args=args)

-        active_users += 1
-        tasks.append(process_single_group(group_id, group_responses, oai_client, options, num_runs))
+        self.evaluation_options = getattr(args, "evaluation_options", ["lexical", "semantic"])
+        self.num_runs = getattr(args, "num_runs", 1)
+        self.max_workers = getattr(args, "workers", 4)

-    print(f"Starting evaluation of {active_users} user groups with responses")
-
-    semaphore = asyncio.Semaphore(max_workers)
-
-    async def limited_task(task):
-        async with semaphore:
-            return await task
-
-    limited_tasks = [limited_task(task) for task in tasks]
-    group_results = await asyncio.gather(*limited_tasks)
-
-    for group_id, graded_responses in group_results:
-        all_grades[group_id] = graded_responses
+        load_dotenv()
+        self.oai_client = AsyncOpenAI(
+            api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL")
+        )

-    print("\n=== Evaluation Complete: Calculating final scores ===")
+    async def run(self):
+        print(
+            f"\n=== Starting LoCoMo evaluation for {self.frame} (version: {self.version}) with {self.num_runs} run(s) per question ==="
+        )
+        print(f"Using {self.max_workers} concurrent workers for processing groups")

-    run_scores = []
-    evaluated_count = 0
-    if num_runs > 0:
-        for i in range(1, num_runs + 1):
-            judgment_key = f"judgment_{i}"
-            current_run_correct_count = 0
-            current_run_total_count = 0
-            for group in all_grades.values():
-                for response in group:
-                    if judgment_key in response["llm_judgments"]:
-                        if response["llm_judgments"][judgment_key]:
-                            current_run_correct_count += 1
-                        current_run_total_count += 1
+        with open(self.response_path) as file:
+            locomo_responses = json.load(file)

-            if current_run_total_count > 0:
-                run_accuracy = current_run_correct_count / current_run_total_count
-                run_scores.append(run_accuracy)
+        num_users = 10
+        all_grades = {}

-            evaluated_count = current_run_total_count
+        total_responses_count = sum(
+            len(locomo_responses.get(f"locomo_exp_user_{i}", [])) for i in range(num_users)
+        )
+        print(f"Found {total_responses_count} total responses across {num_users} users to evaluate")
+
+        # Create tasks for processing each group
+        tasks = []
+        active_users = 0
+        for group_idx in range(num_users):
+            group_id = f"locomo_exp_user_{group_idx}"
+            group_responses = locomo_responses.get(group_id, [])
+            if not group_responses:
+                print(f"No responses found for group {group_id}")
+                continue
+
+            active_users += 1
+            tasks.append(
+                process_single_group(
+                    group_id=group_id,
+                    group_responses=group_responses,
+                    oai_client=self.oai_client,
+                    evaluation_options=self.evaluation_options,
+                    num_runs=self.num_runs,
+                )
+            )

-    if evaluated_count > 0:
-        mean_of_scores = np.mean(run_scores)
-        std_of_scores = np.std(run_scores)
-        print(f"LLM-as-a-Judge Mean Score: {mean_of_scores:.4f}")
-        print(f"LLM-as-a-Judge Standard Deviation: {std_of_scores:.4f}")
-        print(f"(Calculated from {num_runs} separate runs over {evaluated_count} questions)")
-        print(f"Individual run scores: {[round(s, 4) for s in run_scores]}")
-    else:
-        print("No responses were evaluated")
-        print("LLM-as-a-Judge score: N/A (0/0)")
+        print(f"Starting evaluation of {active_users} user groups with responses")
+
+        semaphore = asyncio.Semaphore(self.max_workers)
+
+        async def limited_task(task):
+            async with semaphore:
+                return await task
+
+        limited_tasks = [limited_task(task) for task in tasks]
+        group_results = await asyncio.gather(*limited_tasks)
+
+        for group_id, graded_responses in group_results:
+            all_grades[group_id] = graded_responses
+
+        print("\n=== Evaluation Complete: Calculating final scores ===")
+
+        run_scores = []
+        evaluated_count = 0
+        if self.num_runs > 0:
+            for i in range(1, self.num_runs + 1):
+                judgment_key = f"judgment_{i}"
+                current_run_correct_count = 0
+                current_run_total_count = 0
+                for group in all_grades.values():
+                    for response in group:
+                        if judgment_key in response["llm_judgments"]:
+                            if response["llm_judgments"][judgment_key]:
+                                current_run_correct_count += 1
+                            current_run_total_count += 1
+
+                if current_run_total_count > 0:
+                    run_accuracy = current_run_correct_count / current_run_total_count
+                    run_scores.append(run_accuracy)
+
+                evaluated_count = current_run_total_count
+
+        if evaluated_count > 0:
+            mean_of_scores = np.mean(run_scores)
+            std_of_scores = np.std(run_scores)
+            print(f"LLM-as-a-Judge Mean Score: {mean_of_scores:.4f}")
+            print(f"LLM-as-a-Judge Standard Deviation: {std_of_scores:.4f}")
+            print(
+                f"(Calculated from {self.num_runs} separate runs over {evaluated_count} questions)"
+            )
+            print(f"Individual run scores: {[round(s, 4) for s in run_scores]}")
+        else:
+            print("No responses were evaluated")
+            print("LLM-as-a-Judge score: N/A (0/0)")

-    all_grades = convert_numpy_types(all_grades)
-    with open(judged_path, "w") as f:
-        json.dump(all_grades, f, indent=2)
-    print(f"Saved detailed evaluation results to {judged_path}")
+        all_grades = convert_numpy_types(all_grades)
+        with open(self.judged_path, "w") as f:
+            json.dump(all_grades, f, indent=2)
+        print(f"Saved detailed evaluation results to {self.judged_path}")


 if __name__ == "__main__":
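The new run() method caps concurrency with an asyncio.Semaphore instead of launching every group at once: limited_task acquires the semaphore before awaiting each group's coroutine, so at most self.max_workers groups are graded simultaneously. A minimal standalone sketch of that pattern (the names below are illustrative, not taken from the repository):

import asyncio


async def gather_with_limit(coros, max_workers: int):
    # Allow at most max_workers coroutines to run at once; the rest wait on the semaphore.
    semaphore = asyncio.Semaphore(max_workers)

    async def limited(coro):
        async with semaphore:
            return await coro

    return await asyncio.gather(*(limited(c) for c in coros))


async def demo():
    async def work(i):
        await asyncio.sleep(0.1)
        return i

    # Ten jobs, but never more than four in flight at a time.
    print(await gather_with_limit([work(i) for i in range(10)], max_workers=4))


if __name__ == "__main__":
    asyncio.run(demo())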
@@ -380,10 +395,23 @@ async def limited_task(task):
         default=3,
         help="Number of times to run the LLM grader for each question",
     )
-    parser.add_argument("--options", nargs="+", default=["lexical", "semantic"])
+    parser.add_argument("--evaluation_options", nargs="+", default=["lexical", "semantic"])
     parser.add_argument(
         "--workers", type=int, default=10, help="Number of concurrent workers for processing groups"
     )
-    args = parser.parse_args()
-
-    asyncio.run(main(args.lib, args.version, args.options, args.num_runs, args.workers))
+    cli_args = parser.parse_args()
+
+    # Build args for evaluator
+    class Args:
+        def __init__(self, cli_args):
+            self.frame = cli_args.lib
+            self.version = cli_args.version
+            self.workers = cli_args.workers
+            self.num_runs = cli_args.num_runs
+            self.evaluation_options = cli_args.evaluation_options
+            self.top_k = 20
+            self.scheduler_flag = True
+
+    args = Args(cli_args)
+    evaluator = LocomoEvaluator(args=args)
+    asyncio.run(evaluator.run())
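For reference, a hypothetical programmatic equivalent of the new __main__ block, using argparse.Namespace in place of the ad-hoc Args class. The import path and the placeholder values (e.g. "memos" for --lib) are assumptions, not taken from this commit:

import asyncio
from argparse import Namespace

from locomo_eval import LocomoEvaluator  # assumed import path for this script

args = Namespace(
    frame="memos",                                # placeholder for --lib
    version="default",                            # --version
    workers=10,                                   # --workers
    num_runs=3,                                   # --num_runs
    evaluation_options=["lexical", "semantic"],   # --evaluation_options
    top_k=20,
    scheduler_flag=True,
)

evaluator = LocomoEvaluator(args=args)
asyncio.run(evaluator.run())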
