
Commit 32663c6

Print per evaluator run summary at the end of Evaluate API call (Azure#37859)
* print per evaluator run summary
* remove start_time
1 parent bf50e24 commit 32663c6

3 files changed: 37 additions & 6 deletions

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py

Lines changed: 4 additions & 0 deletions
@@ -182,3 +182,7 @@ def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
             return {}
         return aggregated_metrics
+
+    def get_run_summary(self, run: CodeRun):
+        # Not implemented
+        return None
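
Not part of the commit, but worth noting: both batch-run clients now expose the same get_run_summary hook, and the CodeClient variant is a placeholder that returns None. The summary printer added in _evaluate.py further down filters out falsy summaries, so CodeClient-based runs are simply omitted from the combined output. A minimal sketch of that behaviour, using an invented results dict:

# "results" is an invented sample; the filtering mirrors _print_summary below.
results = {"groundedness": {"run_summary": None}}
printable = {name: r["run_summary"] for name, r in results.items() if r.get("run_summary")}
assert printable == {}  # nothing to print for the CodeClient path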

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

Lines changed: 11 additions & 0 deletions
@@ -7,6 +7,7 @@
 import os
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union
+from collections import OrderedDict
 
 import pandas as pd
 from promptflow.client import PFClient
@@ -60,6 +61,16 @@ def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
         run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
+    def get_run_summary(self, proxy_run):
+        run = proxy_run.run.result()
+        return OrderedDict([
+            ("status", run.status),
+            ("duration", str(run._end_time - run._created_on)),
+            ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
+            ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+            ("log_path", str(run._output_path)),
+        ])
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
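
The "duration" field above relies on str() applied to a datetime difference (assuming run._end_time and run._created_on are datetime objects, as the subtraction implies). A standalone sketch of the resulting format, with made-up timestamps:

from datetime import datetime, timedelta

created_on = datetime(2024, 10, 10, 12, 0, 0)  # stand-in for run._created_on
end_time = created_on + timedelta(seconds=14, microseconds=252049)  # stand-in for run._end_time
print(str(end_time - created_on))  # prints "0:00:14.252049", the shape stored under "duration"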

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 22 additions & 6 deletions
@@ -5,6 +5,7 @@
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+import json
 
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
@@ -567,6 +568,18 @@ def evaluate(
         raise e
 
 
+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {name: result["run_summary"]
+                   for name, result in per_evaluator_results.items()
+                   if result.get("run_summary")}
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
@@ -654,7 +667,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
 
-    def get_evaluators_info(
+    def eval_batch_run(
         batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
     ) -> Dict[str, __EvaluatorInfo]:
         with BatchRunContext(batch_run_client):
@@ -676,6 +689,7 @@ def get_evaluators_info(
                 evaluator_name: {
                     "result": batch_run_client.get_details(run, all_results=True),
                     "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
                 }
                 for evaluator_name, run in runs.items()
             }
@@ -690,16 +704,16 @@ def get_evaluators_info(
         # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
         # The root cause is still unclear, but it seems related to a conflict between the async run uploader
        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        evaluators_info = get_evaluators_info(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
         data = input_data_df
-        evaluators_info = get_evaluators_info(CodeClient(), data=input_data_df)
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
 
     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name, evaluator_info in evaluators_info.items():
-        evaluator_result_df = evaluator_info["result"]
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]
 
         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
@@ -722,7 +736,7 @@ def get_evaluators_info(
             else evaluator_result_df
         )
 
-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
 
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -745,4 +759,6 @@ def get_evaluators_info(
     if output_path:
         _write_output(output_path, result)
 
+    _print_summary(per_evaluator_results)
+
     return result
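
To make the end result concrete, here is a self-contained sketch of the new console output. The evaluator names, counts, and log path are invented, and _print_summary is reproduced only so the snippet runs on its own; entries whose run_summary is falsy (the CodeClient path) are dropped before printing.

import json
from typing import Any, Dict

def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
    # Keep only evaluators that produced a non-empty run summary
    output_dict = {name: result["run_summary"]
                   for name, result in per_evaluator_results.items()
                   if result.get("run_summary")}
    if output_dict:
        print("======= Combined Run Summary (Per Evaluator) =======\n")
        print(json.dumps(output_dict, indent=4))
        print("\n====================================================")

# Invented sample: one ProxyClient-style summary and one CodeClient-style None
per_evaluator_results = {
    "relevance": {
        "result": None,  # a DataFrame in the real flow
        "metrics": {},
        "run_summary": {
            "status": "Completed",
            "duration": "0:00:14.252049",
            "completed_lines": 2,
            "failed_lines": 0,
            "log_path": "/home/user/.promptflow/.runs/sample_run",
        },
    },
    "fluency": {"result": None, "metrics": {}, "run_summary": None},  # omitted from the printout
}

_print_summary(per_evaluator_results)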
