Skip to content

Commit 609a235

Browse files
ankursharmas authored and copybara-github committed
chore: PrettyPrint the output of detailed results generated from adk eval cli command
PiperOrigin-RevId: 812912413
1 parent 772658f commit 609a235

File tree

2 files changed

+107
-9
lines changed

2 files changed

+107
-9
lines changed

src/google/adk/cli/cli_eval.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
from typing import Optional
2626
import uuid
2727

28+
import click
29+
from google.genai import types as genai_types
2830
from typing_extensions import deprecated
2931

3032
from ..agents.llm_agent import Agent
@@ -37,6 +39,8 @@
3739
from ..evaluation.base_eval_service import InferenceResult
3840
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
3941
from ..evaluation.eval_case import EvalCase
42+
from ..evaluation.eval_case import get_all_tool_calls
43+
from ..evaluation.eval_case import IntermediateDataType
4044
from ..evaluation.eval_config import BaseCriterion
4145
from ..evaluation.eval_config import EvalConfig
4246
from ..evaluation.eval_metrics import EvalMetric
@@ -359,6 +363,106 @@ async def run_evals(
359363
logger.exception("Eval failed for `%s:%s`", eval_set_id, eval_name)
360364

361365

366+
def _convert_content_to_text(
    content: Optional[genai_types.Content],
) -> str:
  """Joins the text of all text-bearing parts of *content* with newlines.

  Returns an empty string when the content is missing or has no parts.
  """
  if not content or not content.parts:
    return ""
  texts = (part.text for part in content.parts if part.text)
  return "\n".join(texts)
372+
373+
374+
def _convert_tool_calls_to_text(
    intermediate_data: Optional[IntermediateDataType],
) -> str:
  """Renders each tool call in *intermediate_data* as one line of text."""
  lines = [str(call) for call in get_all_tool_calls(intermediate_data)]
  return "\n".join(lines)
379+
380+
381+
def _rubrics_by_id(metric_result) -> dict:
  """Maps rubric_id -> rubric text for the metric result's criterion rubrics.

  Only call this when the metric result has rubric scores; it assumes
  `metric_result.criterion.rubrics` is present.
  """
  return {
      r["rubric_id"]: r["rubric_content"]["text_property"]
      for r in metric_result.criterion.rubrics
  }


def pretty_print_eval_result(eval_result: EvalCaseResult):
  """Pretty prints eval result.

  Prints the eval set/case ids and overall status, each overall metric
  result (with per-rubric scores when present), and a grid table with one
  row per invocation comparing expected vs. actual responses and tool
  calls.

  Args:
    eval_result: The eval case result to print.

  Raises:
    ModuleNotFoundError: If the optional eval dependencies (pandas,
      tabulate) are not installed.
  """
  try:
    # Imported lazily: these are optional eval-only dependencies.
    import pandas as pd
    from tabulate import tabulate
  except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e

  click.echo(f"Eval Set Id: {eval_result.eval_set_id}")
  click.echo(f"Eval Id: {eval_result.eval_id}")
  click.echo(f"Overall Eval Status: {eval_result.final_eval_status.name}")

  for metric_result in eval_result.overall_eval_metric_results:
    click.echo(
        "---------------------------------------------------------------------"
    )
    click.echo(
        f"Metric: {metric_result.metric_name}, "
        f"Status: {metric_result.eval_status.name}, "
        f"Score: {metric_result.score}, "
        f"Threshold: {metric_result.threshold}"
    )
    if metric_result.details and metric_result.details.rubric_scores:
      click.echo("Rubric Scores:")
      rubrics_by_id = _rubrics_by_id(metric_result)
      for rubric_score in metric_result.details.rubric_scores:
        rubric = rubrics_by_id.get(rubric_score.rubric_id)
        click.echo(
            f"Rubric: {rubric}, "
            f"Score: {rubric_score.score}, "
            f"Reasoning: {rubric_score.rationale}"
        )

  # One row per invocation: prompt, expected/actual response and tool
  # calls, plus one column per metric (and per rubric, when scored).
  data = []
  for per_invocation_result in eval_result.eval_metric_result_per_invocation:
    row_data = {
        "prompt": _convert_content_to_text(
            per_invocation_result.expected_invocation.user_content
        ),
        "expected_response": _convert_content_to_text(
            per_invocation_result.expected_invocation.final_response
        ),
        "actual_response": _convert_content_to_text(
            per_invocation_result.actual_invocation.final_response
        ),
        "expected_tool_calls": _convert_tool_calls_to_text(
            per_invocation_result.expected_invocation.intermediate_data
        ),
        "actual_tool_calls": _convert_tool_calls_to_text(
            per_invocation_result.actual_invocation.intermediate_data
        ),
    }
    for metric_result in per_invocation_result.eval_metric_results:
      row_data[metric_result.metric_name] = (
          f"Status: {metric_result.eval_status.name}, "
          f"Score: {metric_result.score}"
      )
      if metric_result.details and metric_result.details.rubric_scores:
        rubrics_by_id = _rubrics_by_id(metric_result)
        for rubric_score in metric_result.details.rubric_scores:
          rubric = rubrics_by_id.get(rubric_score.rubric_id)
          row_data[f"Rubric: {rubric}"] = (
              f"Reasoning: {rubric_score.rationale}, "
              f"Score: {rubric_score.score}"
          )
    data.append(row_data)
  if data:
    click.echo(
        "---------------------------------------------------------------------"
    )
    click.echo("Invocation Details:")
    df = pd.DataFrame(data)
    for col in df.columns:
      # Wrap long string cells so the grid stays readable in a terminal.
      if df[col].dtype == "object":
        df[col] = df[col].str.wrap(40)
    click.echo(tabulate(df, headers="keys", tablefmt="grid"))
  click.echo("\n\n")  # Few empty lines for visual clarity
464+
465+
362466
def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
363467
try:
364468
from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator

src/google/adk/cli/cli_tools_click.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,7 @@ def cli_eval(
539539
from .cli_eval import get_evaluation_criteria_or_default
540540
from .cli_eval import get_root_agent
541541
from .cli_eval import parse_and_get_evals_to_run
542+
from .cli_eval import pretty_print_eval_result
542543
except ModuleNotFoundError as mnf:
543544
raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
544545

@@ -671,16 +672,9 @@ def cli_eval(
671672
for eval_result in eval_results:
672673
eval_result: EvalCaseResult
673674
click.echo(
674-
"*********************************************************************"
675-
)
676-
click.echo(
677-
eval_result.model_dump_json(
678-
indent=2,
679-
exclude_unset=True,
680-
exclude_defaults=True,
681-
exclude_none=True,
682-
)
675+
"********************************************************************"
683676
)
677+
pretty_print_eval_result(eval_result)
684678

685679

686680
def adk_services_options():

0 commit comments

Comments (0)