```diff
@@ -25,6 +25,8 @@
 from typing import Optional
 import uuid
 
+import click
+from google.genai import types as genai_types
 from typing_extensions import deprecated
 
 from ..agents.llm_agent import Agent
@@ -37,6 +39,8 @@
 from ..evaluation.base_eval_service import InferenceResult
 from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.eval_case import EvalCase
+from ..evaluation.eval_case import get_all_tool_calls
+from ..evaluation.eval_case import IntermediateDataType
 from ..evaluation.eval_config import BaseCriterion
 from ..evaluation.eval_config import EvalConfig
 from ..evaluation.eval_metrics import EvalMetric
@@ -359,6 +363,106 @@ async def run_evals(
       logger.exception("Eval failed for `%s:%s`", eval_set_id, eval_name)
 
 
+def _convert_content_to_text(
+    content: Optional[genai_types.Content],
+) -> str:
+  if content and content.parts:
+    return "\n".join([p.text for p in content.parts if p.text])
+  return ""
+
+
+def _convert_tool_calls_to_text(
+    intermediate_data: Optional[IntermediateDataType],
+) -> str:
+  tool_calls = get_all_tool_calls(intermediate_data)
+  return "\n".join([str(t) for t in tool_calls])
+
+
+def pretty_print_eval_result(eval_result: EvalCaseResult):
+  """Pretty prints eval result."""
+  try:
+    import pandas as pd
+    from tabulate import tabulate
+  except ModuleNotFoundError as e:
+    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+
+  click.echo(f"Eval Set Id: {eval_result.eval_set_id}")
+  click.echo(f"Eval Id: {eval_result.eval_id}")
+  click.echo(f"Overall Eval Status: {eval_result.final_eval_status.name}")
+
+  for metric_result in eval_result.overall_eval_metric_results:
+    click.echo(
+        "---------------------------------------------------------------------"
+    )
+    click.echo(
+        f"Metric: {metric_result.metric_name}, "
+        f"Status: {metric_result.eval_status.name}, "
+        f"Score: {metric_result.score}, "
+        f"Threshold: {metric_result.threshold}"
+    )
+    if metric_result.details and metric_result.details.rubric_scores:
+      click.echo("Rubric Scores:")
+      rubrics_by_id = {
+          r["rubric_id"]: r["rubric_content"]["text_property"]
+          for r in metric_result.criterion.rubrics
+      }
+      for rubric_score in metric_result.details.rubric_scores:
+        rubric = rubrics_by_id.get(rubric_score.rubric_id)
+        click.echo(
+            f"Rubric: {rubric}, "
+            f"Score: {rubric_score.score}, "
+            f"Reasoning: {rubric_score.rationale}"
+        )
+
+  data = []
+  for per_invocation_result in eval_result.eval_metric_result_per_invocation:
+    row_data = {
+        "prompt": _convert_content_to_text(
+            per_invocation_result.expected_invocation.user_content
+        ),
+        "expected_response": _convert_content_to_text(
+            per_invocation_result.expected_invocation.final_response
+        ),
+        "actual_response": _convert_content_to_text(
+            per_invocation_result.actual_invocation.final_response
+        ),
+        "expected_tool_calls": _convert_tool_calls_to_text(
+            per_invocation_result.expected_invocation.intermediate_data
+        ),
+        "actual_tool_calls": _convert_tool_calls_to_text(
+            per_invocation_result.actual_invocation.intermediate_data
+        ),
+    }
+    for metric_result in per_invocation_result.eval_metric_results:
+      row_data[metric_result.metric_name] = (
+          f"Status: {metric_result.eval_status.name}, "
+          f"Score: {metric_result.score}"
+      )
+      if metric_result.details and metric_result.details.rubric_scores:
+        rubrics_by_id = {
+            r["rubric_id"]: r["rubric_content"]["text_property"]
+            for r in metric_result.criterion.rubrics
+        }
+        for rubric_score in metric_result.details.rubric_scores:
+          rubric = rubrics_by_id.get(rubric_score.rubric_id)
+          row_data[f"Rubric: {rubric}"] = (
+              f"Reasoning: {rubric_score.rationale}, "
+              f"Score: {rubric_score.score}"
+          )
+    data.append(row_data)
+  if data:
+    click.echo(
+        "---------------------------------------------------------------------"
+    )
+    click.echo("Invocation Details:")
+    df = pd.DataFrame(data)
+    for col in df.columns:
+      if df[col].dtype == "object":
+        df[col] = df[col].str.wrap(40)
+    click.echo(tabulate(df, headers="keys", tablefmt="grid"))
+  click.echo("\n\n")  # Few empty lines for visual clarity
+
+
 def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
   try:
     from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
```