```diff
@@ -25,6 +25,8 @@
 from typing import Optional
 import uuid
 
+import click
+from google.genai import types as genai_types
 from typing_extensions import deprecated
 
 from ..agents.llm_agent import Agent
@@ -37,6 +39,8 @@
 from ..evaluation.base_eval_service import InferenceResult
 from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.eval_case import EvalCase
+from ..evaluation.eval_case import get_all_tool_calls
+from ..evaluation.eval_case import IntermediateDataType
 from ..evaluation.eval_config import BaseCriterion
 from ..evaluation.eval_config import EvalConfig
 from ..evaluation.eval_metrics import EvalMetric
@@ -359,6 +363,106 @@ async def run_evals(
       logger.exception("Eval failed for `%s:%s`", eval_set_id, eval_name)
 
 
+def _convert_content_to_text(
+    content: Optional[genai_types.Content],
+) -> str:
+  if content and content.parts:
+    return "\n".join([p.text for p in content.parts if p.text])
+  return ""
+
+
+def _convert_tool_calls_to_text(
+    intermediate_data: Optional[IntermediateDataType],
+) -> str:
+  tool_calls = get_all_tool_calls(intermediate_data)
+  return "\n".join([str(t) for t in tool_calls])
+
+
+def pretty_print_eval_result(eval_result: EvalCaseResult):
+  """Pretty prints eval result."""
+  try:
+    import pandas as pd
+    from tabulate import tabulate
+  except ModuleNotFoundError as e:
+    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+
+  click.echo(f"Eval Set Id: {eval_result.eval_set_id}")
+  click.echo(f"Eval Id: {eval_result.eval_id}")
+  click.echo(f"Overall Eval Status: {eval_result.final_eval_status.name}")
+
+  for metric_result in eval_result.overall_eval_metric_results:
+    click.echo(
+        "---------------------------------------------------------------------"
+    )
+    click.echo(
+        f"Metric: {metric_result.metric_name}, "
+        f"Status: {metric_result.eval_status.name}, "
+        f"Score: {metric_result.score}, "
+        f"Threshold: {metric_result.threshold}"
+    )
+    if metric_result.details and metric_result.details.rubric_scores:
+      click.echo("Rubric Scores:")
+      rubrics_by_id = {
+          r["rubric_id"]: r["rubric_content"]["text_property"]
+          for r in metric_result.criterion.rubrics
+      }
+      for rubric_score in metric_result.details.rubric_scores:
+        rubric = rubrics_by_id.get(rubric_score.rubric_id)
+        click.echo(
+            f"Rubric: {rubric}, "
+            f"Score: {rubric_score.score}, "
+            f"Reasoning: {rubric_score.rationale}"
+        )
+
+  data = []
+  for per_invocation_result in eval_result.eval_metric_result_per_invocation:
+    row_data = {
+        "prompt": _convert_content_to_text(
+            per_invocation_result.expected_invocation.user_content
+        ),
+        "expected_response": _convert_content_to_text(
+            per_invocation_result.expected_invocation.final_response
+        ),
+        "actual_response": _convert_content_to_text(
+            per_invocation_result.actual_invocation.final_response
+        ),
+        "expected_tool_calls": _convert_tool_calls_to_text(
+            per_invocation_result.expected_invocation.intermediate_data
+        ),
+        "actual_tool_calls": _convert_tool_calls_to_text(
+            per_invocation_result.actual_invocation.intermediate_data
+        ),
+    }
+    for metric_result in per_invocation_result.eval_metric_results:
+      row_data[metric_result.metric_name] = (
+          f"Status: {metric_result.eval_status.name}, "
+          f"Score: {metric_result.score}"
+      )
+      if metric_result.details and metric_result.details.rubric_scores:
+        rubrics_by_id = {
+            r["rubric_id"]: r["rubric_content"]["text_property"]
+            for r in metric_result.criterion.rubrics
+        }
+        for rubric_score in metric_result.details.rubric_scores:
+          rubric = rubrics_by_id.get(rubric_score.rubric_id)
+          row_data[f"Rubric: {rubric}"] = (
+              f"Reasoning: {rubric_score.rationale}, "
+              f"Score: {rubric_score.score}"
+          )
+    data.append(row_data)
+  if data:
+    click.echo(
+        "---------------------------------------------------------------------"
+    )
+    click.echo("Invocation Details:")
+    df = pd.DataFrame(data)
+    for col in df.columns:
+      if df[col].dtype == "object":
+        df[col] = df[col].str.wrap(40)
+    click.echo(tabulate(df, headers="keys", tablefmt="grid"))
+  click.echo("\n\n")  # Few empty lines for visual clarity
+
+
 def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
   try:
     from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
```