@@ -294,6 +294,7 @@ def generate_responses_for_dataset(
294294def score_responses (
295295 handler : TaskHandler ,
296296 id_to_results : Dict [str , Dict [str , Any ]],
297+ * ,
297298 max_workers : int = 32 ,
298299) -> Tuple [float , Dict [str , List [int ]], int ]:
299300 """Computes correctness for model responses for the given task
@@ -341,7 +342,7 @@ def score_responses(
341342 # TODO (sumanthrh): this can be improved
342343 if unique_id not in id_to_scores :
343344 id_to_scores [unique_id ] = [0 for _ in range (N )]
344- id_to_scores [unique_id ][i ] = new_response_entry ["correctness" ]
345+ id_to_scores [unique_id ][i ] = int ( new_response_entry ["correctness" ])
345346
346347 total_correct += new_response_entry ["correctness" ]
347348 total_finish += 1
@@ -350,6 +351,41 @@ def score_responses(
350351 return accuracy , id_to_scores , total_finish
351352
352353
def score_responses_for_indices(
    handler: TaskHandler,
    id_to_results: Dict[str, Dict[str, Any]],
    *,
    indices: List[str],
) -> List[int]:
    """Computes correctness for model responses for the given unique indices.

    The `id_to_results` dictionary is assumed to be a mapping between
    problem ID -> { responses: [...], ... }. It is updated in-place: each
    response entry gets its `correctness` and `reason` fields set.

    Args:
        handler: Task handler used to score each response via `update_results`.
        id_to_results: Mapping of problem ID to its record (mutated in-place).
        indices: Problem IDs to score; every ID must be a key of `id_to_results`.

    Returns:
        - flat list of integer scores for all requested indices
          (N scores per index, in the order the indices were given)
    """
    if not id_to_results:
        return []
    logger.info(f"Computing scores for {len(indices)} samples")
    # Number of generations per problem — invariant, so hoisted out of the loop.
    N = len(next(iter(id_to_results.values()))["responses"])
    # Accumulate across ALL indices. Previously this list was re-created inside
    # the loop, silently dropping every index's scores except the last one's.
    scores: List[int] = []
    for idx in indices:
        record = id_to_results[idx]
        for i in range(N):
            content = record["responses"][i]["content"]
            response_entry = handler.update_results(record, content)

            # Update correctness and reason in the original results dict
            record["responses"][i]["correctness"] = response_entry["correctness"]
            record["responses"][i]["reason"] = response_entry["reason"]
            # Cast to int for consistency with `score_responses`.
            scores.append(int(response_entry["correctness"]))
    return scores
387+
388+
353389def generate_and_score (
354390 handler : TaskHandler ,
355391 model_config : ModelConfig ,
@@ -480,17 +516,31 @@ def generate_and_save(
480516
481517
482518def score_results (
483- handler : TaskHandler , run_dir : Path , run_summary : SummaryResults
519+ handler : TaskHandler ,
520+ run_dir : Path ,
521+ run_summary : SummaryResults ,
522+ indices : Optional [List [str ]] = None ,
484523) -> None :
485524 # load existing results
486525 result_file = run_dir / RESULTS_FILENAME
487526 summary_file = run_dir / SUMMARY_FILENAME
488527 id_to_results = load_existing_results (result_file )
489528 logger .info (f"Loaded { len (id_to_results )} existing results for scoring." )
490529
491- accuracy , id_to_scores , total_finish = score_responses (handler , id_to_results )
492-
493- logger .info (f"Accuracy: { accuracy } " )
530+ if not indices :
531+ accuracy , id_to_scores , total_finish = score_responses (handler , id_to_results )
532+ else :
533+ N = len (next (iter (id_to_results .values ()))["responses" ])
534+ score_responses_for_indices (handler , id_to_results , indices = indices )
535+ id_to_scores = {
536+ index : [
537+ id_to_results [index ]["responses" ][i ]["correctness" ] for i in range (N )
538+ ]
539+ for index in id_to_results
540+ }
541+ accuracy = round (
542+ sum (map (sum , id_to_scores .values ())) / (len (id_to_scores ) * N ), 4
543+ )
494544
495545 sample_count = 0
496546 if id_to_results :
@@ -501,7 +551,9 @@ def score_results(
501551
502552 run_summary .accuracy = accuracy
503553 run_summary .pass_at_k = pass_at_k_metrics
554+
555+ logger .info (f"Accuracy: { accuracy } " )
504556 save_summary (summary_file , run_summary )
505557
506558 save_results (result_file , id_to_results )
507- logger .info (f"Re-scored results saved to { result_file } " )
559+ logger .info (f"Scored results saved to { result_file } " )
0 commit comments