
Commit 32663c6

Print per evaluator run summary at the end of Evaluate API call (Azure#37859)
* print per evaluator run summary
* remove start_time
1 parent bf50e24 commit 32663c6

3 files changed: 37 additions & 6 deletions

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py

Lines changed: 4 additions & 0 deletions
@@ -182,3 +182,7 @@ def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
             return {}
         return aggregated_metrics
+
+    def get_run_summary(self, run: CodeRun):
+        # Not implemented
+        return None
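
Not part of the commit, but worth noting: both batch-run clients now expose the same get_run_summary hook, and the CodeClient variant is a placeholder that returns None. The summary printer added in _evaluate.py further down filters out falsy summaries, so CodeClient-based runs are simply omitted from the combined output. A minimal sketch of that behaviour, using an invented results dict:

# "results" is an invented sample; the filtering mirrors _print_summary below.
results = {"groundedness": {"run_summary": None}}
printable = {name: r["run_summary"] for name, r in results.items() if r.get("run_summary")}
assert printable == {}  # nothing to print for the CodeClient path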

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

Lines changed: 11 additions & 0 deletions
@@ -7,6 +7,7 @@
 import os
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union
+from collections import OrderedDict
 
 import pandas as pd
 from promptflow.client import PFClient
@@ -60,6 +61,16 @@ def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
         run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
+    def get_run_summary(self, proxy_run):
+        run = proxy_run.run.result()
+        return OrderedDict([
+            ("status", run.status),
+            ("duration", str(run._end_time - run._created_on)),
+            ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
+            ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+            ("log_path", str(run._output_path)),
+        ])
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
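
The "duration" field above relies on str() applied to a datetime difference (assuming run._end_time and run._created_on are datetime objects, as the subtraction implies). A standalone sketch of the resulting format, with made-up timestamps:

from datetime import datetime, timedelta

created_on = datetime(2024, 10, 10, 12, 0, 0)  # stand-in for run._created_on
end_time = created_on + timedelta(seconds=14, microseconds=252049)  # stand-in for run._end_time
print(str(end_time - created_on))  # prints "0:00:14.252049", the shape stored under "duration"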

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 22 additions & 6 deletions
@@ -5,6 +5,7 @@
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+import json
 
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
@@ -567,6 +568,18 @@ def evaluate(
         raise e
 
 
+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {name: result["run_summary"]
+                   for name, result in per_evaluator_results.items()
+                   if result.get("run_summary")}
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
@@ -654,7 +667,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
 
-    def get_evaluators_info(
+    def eval_batch_run(
         batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
     ) -> Dict[str, __EvaluatorInfo]:
         with BatchRunContext(batch_run_client):
@@ -676,6 +689,7 @@ def get_evaluators_info(
                 evaluator_name: {
                     "result": batch_run_client.get_details(run, all_results=True),
                     "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
                 }
                 for evaluator_name, run in runs.items()
             }
@@ -690,16 +704,16 @@ def get_evaluators_info(
         # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
         # The root cause is still unclear, but it seems related to a conflict between the async run uploader
        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        evaluators_info = get_evaluators_info(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
         data = input_data_df
-        evaluators_info = get_evaluators_info(CodeClient(), data=input_data_df)
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
 
     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name, evaluator_info in evaluators_info.items():
-        evaluator_result_df = evaluator_info["result"]
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]
 
         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
@@ -722,7 +736,7 @@ def get_evaluators_info(
             else evaluator_result_df
         )
 
-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
 
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -745,4 +759,6 @@ def get_evaluators_info(
     if output_path:
         _write_output(output_path, result)
 
+    _print_summary(per_evaluator_results)
+
     return result
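
To make the end result concrete, here is a self-contained sketch of the new console output. The evaluator names, counts, and log path are invented, and _print_summary is reproduced only so the snippet runs on its own; entries whose run_summary is falsy (the CodeClient path) are dropped before printing.

import json
from typing import Any, Dict

def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
    # Keep only evaluators that produced a non-empty run summary
    output_dict = {name: result["run_summary"]
                   for name, result in per_evaluator_results.items()
                   if result.get("run_summary")}
    if output_dict:
        print("======= Combined Run Summary (Per Evaluator) =======\n")
        print(json.dumps(output_dict, indent=4))
        print("\n====================================================")

# Invented sample: one ProxyClient-style summary and one CodeClient-style None
per_evaluator_results = {
    "relevance": {
        "result": None,  # a DataFrame in the real flow
        "metrics": {},
        "run_summary": {
            "status": "Completed",
            "duration": "0:00:14.252049",
            "completed_lines": 2,
            "failed_lines": 0,
            "log_path": "/home/user/.promptflow/.runs/sample_run",
        },
    },
    "fluency": {"result": None, "metrics": {}, "run_summary": None},  # omitted from the printout
}

_print_summary(per_evaluator_results)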
