 import argparse
 import asyncio
 import json
-import logging
 import os
 import time

 import nltk
 import numpy as np
-import transformers

 from bert_score import score as bert_score
 from dotenv import load_dotenv
-from locomo_processor import BASE_DIR
+from modules.locomo_eval_module import LocomoEvalModelModules
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
 from nltk.translate.meteor_score import meteor_score
 from openai import AsyncOpenAI
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm

+from memos.log import get_logger
+
+
+logger = get_logger(__name__)

-logging.basicConfig(level=logging.CRITICAL)
-transformers.logging.set_verbosity_error()

 # Download necessary NLTK resources
 try:
@@ -209,7 +209,9 @@ def convert_numpy_types(obj):
     return obj


-async def process_group_responses(group_id, group_responses, oai_client, options, num_runs: int):
+async def process_group_responses(
+    group_id, group_responses, oai_client, evaluation_options, num_runs: int
+):
     graded_responses = []

     # Process responses with asyncio for concurrent API calls
@@ -232,7 +234,7 @@ async def process_group_responses(group_id, group_responses, oai_client, options
         judgments = await asyncio.gather(*grading_tasks)
         judgments_dict = {f"judgment_{i + 1}": j for i, j in enumerate(judgments)}

-        nlp_metrics = calculate_nlp_metrics(ground_truth, answer, context, options)
+        nlp_metrics = calculate_nlp_metrics(ground_truth, answer, context, evaluation_options)

         graded_response = {
             "question": question,
@@ -250,113 +252,126 @@ async def process_group_responses(group_id, group_responses, oai_client, options
     return group_id, graded_responses


-async def process_single_group(group_id, group_responses, oai_client, options, num_runs):
+async def process_single_group(group_id, group_responses, oai_client, evaluation_options, num_runs):
     try:
         start_time = time.time()
         result = await process_group_responses(
-            group_id, group_responses, oai_client, options, num_runs
+            group_id, group_responses, oai_client, evaluation_options, num_runs
         )
         end_time = time.time()
         elapsed_time = round(end_time - start_time, 2)
         print(f"Group {group_id} processed in {elapsed_time} seconds")
         return result
     except Exception as e:
-        print(f"Error processing group {group_id}: {e}")
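+        # Log the full traceback and keep going; a failed group contributes an empty result list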
+        logger.error(f"Error processing group {group_id}: {e}", exc_info=True)
         return group_id, []

-async def main(frame, version="default", options=None, num_runs=1, max_workers=4):
-    print(
-        f"\n=== Starting LoCoMo evaluation for {frame} (version: {version}) with {num_runs} run(s) per question ==="
-    )
-    print(f"Using {max_workers} concurrent workers for processing groups")
-
-    results_dir = f"{BASE_DIR}/results/locomo/{frame}-{version}"
-    response_path = f"{results_dir}/{frame}_locomo_responses.json"
-    judged_path = f"{results_dir}/{frame}_locomo_judged.json"
-
-    os.makedirs(results_dir, exist_ok=True)
-
-    load_dotenv()
-    oai_client = AsyncOpenAI(
-        api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL")
-    )
-
-    with open(response_path) as file:
-        locomo_responses = json.load(file)
-
-    num_users = 10
-    all_grades = {}
-
-    total_responses_count = sum(
-        len(locomo_responses.get(f"locomo_exp_user_{i}", [])) for i in range(num_users)
-    )
-    print(f"Found {total_responses_count} total responses across {num_users} users to evaluate")
-
-    # Create tasks for processing each group
-    tasks = []
-    active_users = 0
-    for group_idx in range(num_users):
-        group_id = f"locomo_exp_user_{group_idx}"
-        group_responses = locomo_responses.get(group_id, [])
-        if not group_responses:
-            print(f"No responses found for group {group_id}")
-            continue
+class LocomoEvaluator(LocomoEvalModelModules):
+    def __init__(self, args):
+        # Initialize base class to populate self.frame, self.version, etc.
+        super().__init__(args=args)

-        active_users += 1
-        tasks.append(process_single_group(group_id, group_responses, oai_client, options, num_runs))
+        self.evaluation_options = getattr(args, "evaluation_options", ["lexical", "semantic"])
+        self.num_runs = getattr(args, "num_runs", 1)
+        self.max_workers = getattr(args, "workers", 4)

-    print(f"Starting evaluation of {active_users} user groups with responses")
-
-    semaphore = asyncio.Semaphore(max_workers)
-
-    async def limited_task(task):
-        async with semaphore:
-            return await task
-
-    limited_tasks = [limited_task(task) for task in tasks]
-    group_results = await asyncio.gather(*limited_tasks)
-
-    for group_id, graded_responses in group_results:
-        all_grades[group_id] = graded_responses
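+        # Load API credentials from .env and build an async client so grading calls can run concurrently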
+        load_dotenv()
+        self.oai_client = AsyncOpenAI(
+            api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL")
+        )

-    print("\n=== Evaluation Complete: Calculating final scores ===")
+    async def run(self):
+        print(
+            f"\n=== Starting LoCoMo evaluation for {self.frame} (version: {self.version}) with {self.num_runs} run(s) per question ==="
+        )
+        print(f"Using {self.max_workers} concurrent workers for processing groups")

-    run_scores = []
-    evaluated_count = 0
-    if num_runs > 0:
-        for i in range(1, num_runs + 1):
-            judgment_key = f"judgment_{i}"
-            current_run_correct_count = 0
-            current_run_total_count = 0
-            for group in all_grades.values():
-                for response in group:
-                    if judgment_key in response["llm_judgments"]:
-                        if response["llm_judgments"][judgment_key]:
-                            current_run_correct_count += 1
-                        current_run_total_count += 1
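+        # response_path and judged_path are attributes populated by the LocomoEvalModelModules base class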
+        with open(self.response_path) as file:
+            locomo_responses = json.load(file)

-            if current_run_total_count > 0:
-                run_accuracy = current_run_correct_count / current_run_total_count
-                run_scores.append(run_accuracy)
+        num_users = 10
+        all_grades = {}

-                evaluated_count = current_run_total_count
+        total_responses_count = sum(
+            len(locomo_responses.get(f"locomo_exp_user_{i}", [])) for i in range(num_users)
+        )
+        print(f"Found {total_responses_count} total responses across {num_users} users to evaluate")
+
+        # Create tasks for processing each group
+        tasks = []
+        active_users = 0
+        for group_idx in range(num_users):
+            group_id = f"locomo_exp_user_{group_idx}"
+            group_responses = locomo_responses.get(group_id, [])
+            if not group_responses:
+                print(f"No responses found for group {group_id}")
+                continue
+
+            active_users += 1
+            tasks.append(
+                process_single_group(
+                    group_id=group_id,
+                    group_responses=group_responses,
+                    oai_client=self.oai_client,
+                    evaluation_options=self.evaluation_options,
+                    num_runs=self.num_runs,
+                )
+            )

-    if evaluated_count > 0:
-        mean_of_scores = np.mean(run_scores)
-        std_of_scores = np.std(run_scores)
-        print(f"LLM-as-a-Judge Mean Score: {mean_of_scores:.4f}")
-        print(f"LLM-as-a-Judge Standard Deviation: {std_of_scores:.4f}")
-        print(f"(Calculated from {num_runs} separate runs over {evaluated_count} questions)")
-        print(f"Individual run scores: {[round(s, 4) for s in run_scores]}")
-    else:
-        print("No responses were evaluated")
-        print("LLM-as-a-Judge score: N/A (0/0)")
+        print(f"Starting evaluation of {active_users} user groups with responses")
+
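+        # Cap concurrency: at most self.max_workers user groups are graded at once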
+        semaphore = asyncio.Semaphore(self.max_workers)
+
+        async def limited_task(task):
+            async with semaphore:
+                return await task
+
+        limited_tasks = [limited_task(task) for task in tasks]
+        group_results = await asyncio.gather(*limited_tasks)
+
+        for group_id, graded_responses in group_results:
+            all_grades[group_id] = graded_responses
+
+        print("\n=== Evaluation Complete: Calculating final scores ===")
+
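+        # Score each grading run separately so a mean and standard deviation can be reported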
+        run_scores = []
+        evaluated_count = 0
+        if self.num_runs > 0:
+            for i in range(1, self.num_runs + 1):
+                judgment_key = f"judgment_{i}"
+                current_run_correct_count = 0
+                current_run_total_count = 0
+                for group in all_grades.values():
+                    for response in group:
+                        if judgment_key in response["llm_judgments"]:
+                            if response["llm_judgments"][judgment_key]:
+                                current_run_correct_count += 1
+                            current_run_total_count += 1
+
+                if current_run_total_count > 0:
+                    run_accuracy = current_run_correct_count / current_run_total_count
+                    run_scores.append(run_accuracy)
+
+                    evaluated_count = current_run_total_count
+
+        if evaluated_count > 0:
+            mean_of_scores = np.mean(run_scores)
+            std_of_scores = np.std(run_scores)
+            print(f"LLM-as-a-Judge Mean Score: {mean_of_scores:.4f}")
+            print(f"LLM-as-a-Judge Standard Deviation: {std_of_scores:.4f}")
+            print(
+                f"(Calculated from {self.num_runs} separate runs over {evaluated_count} questions)"
+            )
+            print(f"Individual run scores: {[round(s, 4) for s in run_scores]}")
+        else:
+            print("No responses were evaluated")
+            print("LLM-as-a-Judge score: N/A (0/0)")

-    all_grades = convert_numpy_types(all_grades)
-    with open(judged_path, "w") as f:
-        json.dump(all_grades, f, indent=2)
-    print(f"Saved detailed evaluation results to {judged_path}")
+        all_grades = convert_numpy_types(all_grades)
+        with open(self.judged_path, "w") as f:
+            json.dump(all_grades, f, indent=2)
+        print(f"Saved detailed evaluation results to {self.judged_path}")


 if __name__ == "__main__":
@@ -380,10 +395,23 @@ async def limited_task(task):
         default=3,
         help="Number of times to run the LLM grader for each question",
     )
-    parser.add_argument("--options", nargs="+", default=["lexical", "semantic"])
+    parser.add_argument("--evaluation_options", nargs="+", default=["lexical", "semantic"])
     parser.add_argument(
         "--workers", type=int, default=10, help="Number of concurrent workers for processing groups"
     )
-    args = parser.parse_args()
-
-    asyncio.run(main(args.lib, args.version, args.options, args.num_runs, args.workers))
+    cli_args = parser.parse_args()
+
+    # Build args for evaluator
+    class Args:
+        def __init__(self, cli_args):
+            self.frame = cli_args.lib
+            self.version = cli_args.version
+            self.workers = cli_args.workers
+            self.num_runs = cli_args.num_runs
+            self.evaluation_options = cli_args.evaluation_options
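+            # Fixed values passed through to the base class; not exposed as CLI flags here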
+            self.top_k = 20
+            self.scheduler_flag = True
+
+    args = Args(cli_args)
+    evaluator = LocomoEvaluator(args=args)
+    asyncio.run(evaluator.run())
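
For reference, the refactored entry point might be invoked roughly like this (the script filename and the --lib/--version values are illustrative, not taken from this diff; --num_runs defaults to 3 and --workers to 10):

    python locomo_eval.py --lib memos --version default --num_runs 3 --workers 10 --evaluation_options lexical semantic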