
Commit 65554d6

ankursharmas authored and copybara-github committed
chore: Update AgentEvaluator to use EvalConfig
We updated one of the public methods on AgentEvaluator to take eval metric configurations via a more formal EvalConfig data model. We also marked the "criteria" field on that method as deprecated and updated some integration test cases.
PiperOrigin-RevId: 814314134
1 parent e680063 commit 65554d6
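
For callers, the change looks roughly like the sketch below. This is a hedged illustration rather than code from the commit: the package-style import paths (google.adk.evaluation.*) are inferred from the file layout in this diff, and my_agent_module is a placeholder name.

import asyncio

from google.adk.evaluation.agent_evaluator import AgentEvaluator
from google.adk.evaluation.eval_config import EvalConfig
from google.adk.evaluation.eval_set import EvalSet  # assumed module path


async def run_my_eval(eval_set: EvalSet) -> None:
  # Old style (now deprecated, still handled by an internal shim):
  #   await AgentEvaluator.evaluate_eval_set(
  #       agent_module="my_agent_module", eval_set=eval_set,
  #       criteria={"tool_trajectory_avg_score": 1.0})
  # New style: pass an EvalConfig instead; bare floats are accepted as thresholds.
  await AgentEvaluator.evaluate_eval_set(
      agent_module="my_agent_module",  # placeholder agent module name
      eval_set=eval_set,
      eval_config=EvalConfig(
          criteria={
              "tool_trajectory_avg_score": 1.0,
              "response_match_score": 0.8,
          }
      ),
  )

# Usage: asyncio.run(run_my_eval(some_eval_set))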

File tree

9 files changed (+214 −95 lines changed)

src/google/adk/cli/cli_eval.py

Lines changed: 0 additions & 51 deletions
@@ -16,7 +16,6 @@
 
 import importlib.util
 import inspect
-import json
 import logging
 import os
 import sys
@@ -70,10 +69,6 @@
     RESPONSE_MATCH_SCORE_KEY: 0.8,
 }
 
-_DEFAULT_EVAL_CONFIG = EvalConfig(
-    criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
-)
-
 
 def _import_from_path(module_name, file_path):
   spec = importlib.util.spec_from_file_location(module_name, file_path)
@@ -89,52 +84,6 @@ def _get_agent_module(agent_module_file_path: str):
   return _import_from_path(module_name, file_path)
 
 
-def get_evaluation_criteria_or_default(
-    eval_config_file_path: str,
-) -> EvalConfig:
-  """Returns EvalConfig read from the config file, if present.
-
-  Otherwise a default one is returned.
-  """
-  if eval_config_file_path:
-    with open(eval_config_file_path, "r", encoding="utf-8") as f:
-      content = f.read()
-    return EvalConfig.model_validate_json(content)
-
-  logger.info("No config file supplied. Using default criteria.")
-  return _DEFAULT_EVAL_CONFIG
-
-
-def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
-  """Returns a list of EvalMetrics mapped from the EvalConfig."""
-  eval_metric_list = []
-  if eval_config.criteria:
-    for metric_name, criterion in eval_config.criteria.items():
-      if isinstance(criterion, float):
-        eval_metric_list.append(
-            EvalMetric(
-                metric_name=metric_name,
-                threshold=criterion,
-                criterion=BaseCriterion(threshold=criterion),
-            )
-        )
-      elif isinstance(criterion, BaseCriterion):
-        eval_metric_list.append(
-            EvalMetric(
-                metric_name=metric_name,
-                threshold=criterion.threshold,
-                criterion=criterion,
-            )
-        )
-      else:
-        raise ValueError(
-            f"Unexpected criterion type. {type(criterion).__name__} not"
-            " supported."
-        )
-
-  return eval_metric_list
-
-
 def get_root_agent(agent_module_file_path: str) -> Agent:
   """Returns root agent given the agent module."""
   agent_module = _get_agent_module(agent_module_file_path)

src/google/adk/cli/cli_tools_click.py

Lines changed: 2 additions & 4 deletions
@@ -524,8 +524,8 @@ def cli_eval(
   try:
     from ..evaluation.base_eval_service import InferenceConfig
    from ..evaluation.base_eval_service import InferenceRequest
-    from ..evaluation.eval_metrics import EvalMetric
-    from ..evaluation.eval_metrics import JudgeModelOptions
+    from ..evaluation.eval_config import get_eval_metrics_from_config
+    from ..evaluation.eval_config import get_evaluation_criteria_or_default
    from ..evaluation.eval_result import EvalCaseResult
    from ..evaluation.evaluator import EvalStatus
    from ..evaluation.in_memory_eval_sets_manager import InMemoryEvalSetsManager
@@ -535,8 +535,6 @@ def cli_eval(
    from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
    from .cli_eval import _collect_eval_results
    from .cli_eval import _collect_inferences
-    from .cli_eval import get_eval_metrics_from_config
-    from .cli_eval import get_evaluation_criteria_or_default
    from .cli_eval import get_root_agent
    from .cli_eval import parse_and_get_evals_to_run
    from .cli_eval import pretty_print_eval_result

src/google/adk/evaluation/agent_evaluator.py

Lines changed: 34 additions & 33 deletions
@@ -37,6 +37,10 @@
 from .eval_case import get_all_tool_calls
 from .eval_case import IntermediateDataType
 from .eval_case import Invocation
+from .eval_config import EvalConfig
+from .eval_config import get_eval_metrics_from_config
+from .eval_config import get_evaluation_criteria_or_default
+from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
 from .eval_metrics import PrebuiltMetrics
@@ -72,12 +76,6 @@
 EXPECTED_TOOL_USE_COLUMN = "expected_tool_use"
 
 
-DEFAULT_CRITERIA = {
-    TOOL_TRAJECTORY_SCORE_KEY: 1.0,  # 1-point scale; 1.0 is perfect.
-    RESPONSE_MATCH_SCORE_KEY: 0.8,  # Rouge-1 text match; 0.8 is default.
-}
-
-
 def load_json(file_path: str) -> Union[Dict, List]:
   with open(file_path, "r") as f:
     return json.load(f)
@@ -99,28 +97,18 @@ class AgentEvaluator:
   """An evaluator for Agents, mainly intended for helping with test cases."""
 
   @staticmethod
-  def find_config_for_test_file(test_file: str):
+  def find_config_for_test_file(test_file: str) -> EvalConfig:
     """Find the test_config.json file in the same folder as the test file."""
     test_folder = os.path.dirname(test_file)
     config_path = os.path.join(test_folder, "test_config.json")
-    if os.path.exists(config_path):
-      config_data = load_json(config_path)
-      if "criteria" in config_data and isinstance(
-          config_data["criteria"], dict
-      ):
-        return config_data["criteria"]
-      else:
-        raise ValueError(
-            f"Invalid format for test_config.json at {config_path}. Expected a"
-            " 'criteria' dictionary."
-        )
-    return DEFAULT_CRITERIA
+    return get_evaluation_criteria_or_default(config_path)
 
   @staticmethod
   async def evaluate_eval_set(
       agent_module: str,
       eval_set: EvalSet,
-      criteria: dict[str, float],
+      criteria: Optional[dict[str, float]] = None,
+      eval_config: Optional[EvalConfig] = None,
       num_runs: int = NUM_RUNS,
       agent_name: Optional[str] = None,
       print_detailed_results: bool = True,
@@ -133,20 +121,33 @@ async def evaluate_eval_set(
         look for 'root_agent' in the loaded module.
       eval_set: The eval set.
       criteria: Evauation criterias, a dictionary of metric names to their
-        respective thresholds.
+        respective thresholds. This field is deprecated.
+      eval_config: The evauation config.
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent, if trying to evaluate something other
         than root agent. If left empty or none, then root agent is evaluated.
       print_detailed_results: Whether to print detailed results for each metric
         evaluation.
     """
+    if criteria:
+      logger.warning(
+          "`criteria` field is deprecated and will be removed in future"
+          " iterations. For now, we will automatically map values in `criteria`"
+          " to `eval_config`, but you should move to using `eval_config` field."
+      )
+      base_criteria = {
+          k: BaseCriterion(threshold=v) for k, v in criteria.items()
+      }
+      eval_config = EvalConfig(criteria=base_criteria)
+
+    if eval_config is None:
+      raise ValueError("`eval_config` is required.")
+
     agent_for_eval = AgentEvaluator._get_agent_for_eval(
         module_name=agent_module, agent_name=agent_name
     )
-    eval_metrics = [
-        EvalMetric(metric_name=n, threshold=t) for n, t in criteria.items()
-    ]
+    eval_metrics = get_eval_metrics_from_config(eval_config)
 
     # Step 1: Perform evals, basically inferencing and evaluation of metrics
     eval_results_by_eval_id = await AgentEvaluator._get_eval_results_by_eval_id(
@@ -226,15 +227,15 @@ async def evaluate(
     initial_session = AgentEvaluator._get_initial_session(initial_session_file)
 
     for test_file in test_files:
-      criteria = AgentEvaluator.find_config_for_test_file(test_file)
+      eval_config = AgentEvaluator.find_config_for_test_file(test_file)
       eval_set = AgentEvaluator._load_eval_set_from_file(
-          test_file, criteria, initial_session
+          test_file, eval_config, initial_session
      )
 
       await AgentEvaluator.evaluate_eval_set(
           agent_module=agent_module,
           eval_set=eval_set,
-          criteria=criteria,
+          eval_config=eval_config,
           num_runs=num_runs,
           agent_name=agent_name,
           print_detailed_results=print_detailed_results,
@@ -252,11 +253,11 @@ def migrate_eval_data_to_new_schema(
          "One of old_eval_data_file or new_eval_data_file is empty."
      )
 
-    criteria = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
+    eval_config = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
     initial_session = AgentEvaluator._get_initial_session(initial_session_file)
 
     eval_set = AgentEvaluator._get_eval_set_from_old_format(
-        old_eval_data_file, criteria, initial_session
+        old_eval_data_file, eval_config, initial_session
    )
 
     with open(new_eval_data_file, "w") as f:
@@ -265,7 +266,7 @@ def migrate_eval_data_to_new_schema(
   @staticmethod
   def _load_eval_set_from_file(
       eval_set_file: str,
-      criteria: dict[str, float],
+      eval_config: EvalConfig,
       initial_session: dict[str, Any],
   ) -> EvalSet:
     """Loads an EvalSet from the given file."""
@@ -292,17 +293,17 @@ def _load_eval_set_from_file(
 
     # If we are here, the data must be specified in the older format.
     return AgentEvaluator._get_eval_set_from_old_format(
-        eval_set_file, criteria, initial_session
+        eval_set_file, eval_config, initial_session
    )
 
   @staticmethod
   def _get_eval_set_from_old_format(
       eval_set_file: str,
-      criteria: dict[str, float],
+      eval_config: EvalConfig,
       initial_session: dict[str, Any],
   ) -> EvalSet:
     data = AgentEvaluator._load_dataset(eval_set_file)[0]
-    AgentEvaluator._validate_input([data], criteria)
+    AgentEvaluator._validate_input([data], eval_config.criteria)
     eval_data = {
         "name": eval_set_file,
         "data": data,

src/google/adk/evaluation/eval_config.py

Lines changed: 56 additions & 0 deletions
@@ -14,16 +14,21 @@
 
 from __future__ import annotations
 
+import logging
+from typing import Optional
 from typing import Union
 
 from pydantic import alias_generators
 from pydantic import BaseModel
 from pydantic import ConfigDict
 from pydantic import Field
 
+from ..evaluation.eval_metrics import EvalMetric
 from .eval_metrics import BaseCriterion
 from .eval_metrics import Threshold
 
+logger = logging.getLogger("google_adk." + __name__)
+
 
 class EvalConfig(BaseModel):
   """Configurations needed to run an Eval.
@@ -64,3 +69,54 @@ class EvalConfig(BaseModel):
           }
       """,
   )
+
+
+_DEFAULT_EVAL_CONFIG = EvalConfig(
+    criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
+)
+
+
+def get_evaluation_criteria_or_default(
+    eval_config_file_path: Optional[str],
+) -> EvalConfig:
+  """Returns EvalConfig read from the config file, if present.
+
+  Otherwise a default one is returned.
+  """
+  if eval_config_file_path:
+    with open(eval_config_file_path, "r", encoding="utf-8") as f:
+      content = f.read()
+    return EvalConfig.model_validate_json(content)
+
+  logger.info("No config file supplied. Using default criteria.")
+  return _DEFAULT_EVAL_CONFIG
+
+
+def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
+  """Returns a list of EvalMetrics mapped from the EvalConfig."""
+  eval_metric_list = []
+  if eval_config.criteria:
+    for metric_name, criterion in eval_config.criteria.items():
+      if isinstance(criterion, float):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion,
+                criterion=BaseCriterion(threshold=criterion),
+            )
+        )
+      elif isinstance(criterion, BaseCriterion):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion.threshold,
+                criterion=criterion,
+            )
+        )
+      else:
+        raise ValueError(
+            f"Unexpected criterion type. {type(criterion).__name__} not"
+            " supported."
+        )
+
+  return eval_metric_list
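
A short usage sketch of the two helpers added above, assuming package-style imports that mirror the relative imports in this file. Both the bare-float and explicit BaseCriterion forms of a criterion end up as EvalMetric objects.

from google.adk.evaluation.eval_config import EvalConfig
from google.adk.evaluation.eval_config import get_eval_metrics_from_config
from google.adk.evaluation.eval_metrics import BaseCriterion

config = EvalConfig(
    criteria={
        "tool_trajectory_avg_score": 1.0,                      # bare float threshold
        "response_match_score": BaseCriterion(threshold=0.8),  # explicit criterion
    }
)

for metric in get_eval_metrics_from_config(config):
  # Each entry is an EvalMetric carrying a name, a threshold, and a criterion.
  print(metric.metric_name, metric.threshold)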
Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,6 @@
 {
   "criteria": {
     "tool_trajectory_avg_score": 1.0,
-    "response_match_score": 0.5,
-    "safety_v1": 0.8
+    "response_match_score": 0.5
   }
 }
Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "criteria": {
     "tool_trajectory_avg_score": 1.0,
-    "safety_v1": 0.8
+    "response_match_score": 0.3
   }
 }
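
The updated test_config.json files above are read by the new config loader; a sketch of what that amounts to, assuming bare floats remain valid criterion values as in the default config:

from google.adk.evaluation.eval_config import EvalConfig

# get_evaluation_criteria_or_default() reads the file and validates its contents
# exactly like this:
raw = '{"criteria": {"tool_trajectory_avg_score": 1.0, "response_match_score": 0.5}}'
config = EvalConfig.model_validate_json(raw)
print(config.criteria)  # the two thresholds from the JSON above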
