@@ -5,6 +5,7 @@
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+import json

 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
@@ -567,6 +568,18 @@ def evaluate(
         raise e


+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {name: result["run_summary"]
+                   for name, result in per_evaluator_results.items()
+                   if result.get("run_summary")}
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
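As a quick orientation, here is a minimal standalone sketch of how the new `_print_summary` helper behaves. The helper body is copied from the hunk above; the sample `run_summary` fields (`status`, `completed_lines`, `failed_lines`) are hypothetical placeholders, not values from this PR.

```python
import json
from typing import Any, Dict

def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
    # Keep only evaluators whose "run_summary" is present and non-empty.
    output_dict = {name: result["run_summary"]
                   for name, result in per_evaluator_results.items()
                   if result.get("run_summary")}

    if output_dict:
        print("======= Combined Run Summary (Per Evaluator) =======\n")
        print(json.dumps(output_dict, indent=4))
        print("\n====================================================")

# Hypothetical input: "fluency" has an empty summary, so it is filtered out;
# if every summary were empty, nothing would be printed at all.
_print_summary({
    "relevance": {"metrics": {}, "run_summary": {"status": "Completed", "completed_lines": 3, "failed_lines": 0}},
    "fluency": {"metrics": {}, "run_summary": {}},
})
```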
@@ -654,7 +667,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"

-    def get_evaluators_info(
+    def eval_batch_run(
         batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
     ) -> Dict[str, __EvaluatorInfo]:
         with BatchRunContext(batch_run_client):
@@ -676,6 +689,7 @@ def get_evaluators_info(
             evaluator_name: {
                 "result": batch_run_client.get_details(run, all_results=True),
                 "metrics": batch_run_client.get_metrics(run),
+                "run_summary": batch_run_client.get_run_summary(run),
             }
             for evaluator_name, run in runs.items()
         }
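The shape of the mapping returned by `eval_batch_run` after this change, sketched with an illustrative evaluator name and elided values; only the three keys come from the diff itself.

```python
# Illustrative only: the evaluator name and values are placeholders.
per_evaluator_results = {
    "relevance": {
        "result": ...,       # pd.DataFrame from batch_run_client.get_details(run, all_results=True)
        "metrics": ...,      # dict from batch_run_client.get_metrics(run)
        "run_summary": ...,  # dict from batch_run_client.get_run_summary(run) -- new in this PR
    },
}
```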
@@ -690,16 +704,16 @@ def get_evaluators_info(
         # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
         # The root cause is still unclear, but it seems related to a conflict between the async run uploader
         # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        evaluators_info = get_evaluators_info(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
         data = input_data_df
-        evaluators_info = get_evaluators_info(CodeClient(), data=input_data_df)
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)

     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name, evaluator_info in evaluators_info.items():
-        evaluator_result_df = evaluator_info["result"]
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]

         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
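Note that `eval_batch_run` is called with both `ProxyClient` and `CodeClient`, so each client is now expected to expose `get_run_summary` alongside `get_details` and `get_metrics`. A duck-typing sketch of that assumed interface follows; the `Protocol` below is introduced here purely for illustration and does not exist in the codebase.

```python
from typing import Any, Dict, Protocol

import pandas as pd

class SupportsBatchRun(Protocol):
    # Methods this diff calls on batch_run_client; signatures are assumptions
    # inferred from the call sites above, not taken from the real classes.
    def get_details(self, run: Any, all_results: bool = False) -> pd.DataFrame: ...
    def get_metrics(self, run: Any) -> Dict[str, Any]: ...
    def get_run_summary(self, run: Any) -> Dict[str, Any]: ...
```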
@@ -722,7 +736,7 @@ def get_evaluators_info(
             else evaluator_result_df
         )

-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})

     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -745,4 +759,6 @@ def get_evaluators_info(
     if output_path:
         _write_output(output_path, result)

+    _print_summary(per_evaluator_results)
+
     return result