@@ -37,6 +37,10 @@
 from .eval_case import get_all_tool_calls
 from .eval_case import IntermediateDataType
 from .eval_case import Invocation
+from .eval_config import EvalConfig
+from .eval_config import get_eval_metrics_from_config
+from .eval_config import get_evaluation_criteria_or_default
+from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
 from .eval_metrics import PrebuiltMetrics
@@ -72,12 +76,6 @@
 EXPECTED_TOOL_USE_COLUMN = "expected_tool_use"


-DEFAULT_CRITERIA = {
-    TOOL_TRAJECTORY_SCORE_KEY: 1.0,  # 1-point scale; 1.0 is perfect.
-    RESPONSE_MATCH_SCORE_KEY: 0.8,  # Rouge-1 text match; 0.8 is default.
-}
-
-
 def load_json(file_path: str) -> Union[Dict, List]:
   with open(file_path, "r") as f:
     return json.load(f)
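
Note: the deleted DEFAULT_CRITERIA defaults are not lost; they move behind
get_evaluation_criteria_or_default in eval_config.py. A minimal sketch of the
equivalent config under the new model, using only constructors this diff
itself exercises (EvalConfig(criteria=...) and BaseCriterion(threshold=...));
the authoritative defaults live in eval_config.py:

    # Sketch only: the two defaults the removed dict encoded, as an EvalConfig.
    # TOOL_TRAJECTORY_SCORE_KEY and RESPONSE_MATCH_SCORE_KEY are the
    # metric-name constants this module already defines.
    _default_config = EvalConfig(
        criteria={
            TOOL_TRAJECTORY_SCORE_KEY: BaseCriterion(threshold=1.0),  # perfect trajectory
            RESPONSE_MATCH_SCORE_KEY: BaseCriterion(threshold=0.8),  # Rouge-1 match
        }
    )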
@@ -99,28 +97,18 @@ class AgentEvaluator:
   """An evaluator for Agents, mainly intended for helping with test cases."""

   @staticmethod
-  def find_config_for_test_file(test_file: str):
+  def find_config_for_test_file(test_file: str) -> EvalConfig:
     """Find the test_config.json file in the same folder as the test file."""
     test_folder = os.path.dirname(test_file)
     config_path = os.path.join(test_folder, "test_config.json")
-    if os.path.exists(config_path):
-      config_data = load_json(config_path)
-      if "criteria" in config_data and isinstance(
-          config_data["criteria"], dict
-      ):
-        return config_data["criteria"]
-      else:
-        raise ValueError(
-            f"Invalid format for test_config.json at {config_path}. Expected a"
-            " 'criteria' dictionary."
-        )
-    return DEFAULT_CRITERIA
+    return get_evaluation_criteria_or_default(config_path)

   @staticmethod
   async def evaluate_eval_set(
       agent_module: str,
       eval_set: EvalSet,
-      criteria: dict[str, float],
+      criteria: Optional[dict[str, float]] = None,
+      eval_config: Optional[EvalConfig] = None,
       num_runs: int = NUM_RUNS,
       agent_name: Optional[str] = None,
       print_detailed_results: bool = True,
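
Note: find_config_for_test_file now returns a full EvalConfig rather than a
bare criteria dict, delegating parsing and defaulting to
get_evaluation_criteria_or_default. A hypothetical caller's view (the
test-file path is made up; the real test_config.json schema is owned by
eval_config.py):

    # Looks for test_config.json next to the test file; falls back to the
    # package defaults when the file is absent.
    config = AgentEvaluator.find_config_for_test_file(
        "tests/fixture/my_agent/simple.test.json"  # hypothetical path
    )
    assert isinstance(config, EvalConfig)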
@@ -133,20 +121,33 @@ async def evaluate_eval_set(
         look for 'root_agent' in the loaded module.
       eval_set: The eval set.
       criteria: Evaluation criteria, a dictionary of metric names to their
-        respective thresholds.
+        respective thresholds. This field is deprecated.
+      eval_config: The evaluation config.
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent, if trying to evaluate something other
         than root agent. If left empty or none, then root agent is evaluated.
       print_detailed_results: Whether to print detailed results for each metric
         evaluation.
     """
+    if criteria:
+      logger.warning(
+          "`criteria` field is deprecated and will be removed in future"
+          " iterations. For now, we will automatically map values in `criteria`"
+          " to `eval_config`, but you should move to the `eval_config` field."
+      )
+      base_criteria = {
+          k: BaseCriterion(threshold=v) for k, v in criteria.items()
+      }
+      eval_config = EvalConfig(criteria=base_criteria)
+
+    if eval_config is None:
+      raise ValueError("`eval_config` is required.")
+
     agent_for_eval = AgentEvaluator._get_agent_for_eval(
         module_name=agent_module, agent_name=agent_name
     )
-    eval_metrics = [
-        EvalMetric(metric_name=n, threshold=t) for n, t in criteria.items()
-    ]
+    eval_metrics = get_eval_metrics_from_config(eval_config)

     # Step 1: Perform evals, basically inferencing and evaluation of metrics
     eval_results_by_eval_id = await AgentEvaluator._get_eval_results_by_eval_id(
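
Note: for callers, the migration reads roughly as below (a sketch;
my_eval_set, the module path, and the "response_match_score" key are
illustrative). The deprecated form keeps working for now: it logs the warning
above and is mapped to an EvalConfig internally.

    # Deprecated: flat metric-name -> threshold dict (warns, then auto-mapped).
    await AgentEvaluator.evaluate_eval_set(
        agent_module="my_agents.hello_world",  # hypothetical module
        eval_set=my_eval_set,
        criteria={"response_match_score": 0.8},
    )

    # Preferred: an explicit EvalConfig with per-metric criterion objects.
    await AgentEvaluator.evaluate_eval_set(
        agent_module="my_agents.hello_world",
        eval_set=my_eval_set,
        eval_config=EvalConfig(
            criteria={"response_match_score": BaseCriterion(threshold=0.8)}
        ),
    )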
@@ -226,15 +227,15 @@ async def evaluate(
     initial_session = AgentEvaluator._get_initial_session(initial_session_file)

     for test_file in test_files:
-      criteria = AgentEvaluator.find_config_for_test_file(test_file)
+      eval_config = AgentEvaluator.find_config_for_test_file(test_file)
       eval_set = AgentEvaluator._load_eval_set_from_file(
-          test_file, criteria, initial_session
+          test_file, eval_config, initial_session
       )

       await AgentEvaluator.evaluate_eval_set(
           agent_module=agent_module,
           eval_set=eval_set,
-          criteria=criteria,
+          eval_config=eval_config,
           num_runs=num_runs,
           agent_name=agent_name,
           print_detailed_results=print_detailed_results,
@@ -252,11 +253,11 @@ def migrate_eval_data_to_new_schema(
           "One of old_eval_data_file or new_eval_data_file is empty."
       )

-    criteria = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
+    eval_config = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
     initial_session = AgentEvaluator._get_initial_session(initial_session_file)

     eval_set = AgentEvaluator._get_eval_set_from_old_format(
-        old_eval_data_file, criteria, initial_session
+        old_eval_data_file, eval_config, initial_session
     )

     with open(new_eval_data_file, "w") as f:
@@ -265,7 +266,7 @@ def migrate_eval_data_to_new_schema(
   @staticmethod
   def _load_eval_set_from_file(
       eval_set_file: str,
-      criteria: dict[str, float],
+      eval_config: EvalConfig,
       initial_session: dict[str, Any],
   ) -> EvalSet:
     """Loads an EvalSet from the given file."""
@@ -292,17 +293,17 @@ def _load_eval_set_from_file(

     # If we are here, the data must be specified in the older format.
     return AgentEvaluator._get_eval_set_from_old_format(
-        eval_set_file, criteria, initial_session
+        eval_set_file, eval_config, initial_session
     )

   @staticmethod
   def _get_eval_set_from_old_format(
       eval_set_file: str,
-      criteria: dict[str, float],
+      eval_config: EvalConfig,
       initial_session: dict[str, Any],
   ) -> EvalSet:
     data = AgentEvaluator._load_dataset(eval_set_file)[0]
-    AgentEvaluator._validate_input([data], criteria)
+    AgentEvaluator._validate_input([data], eval_config.criteria)
     eval_data = {
         "name": eval_set_file,
         "data": data,