@@ -37,6 +37,10 @@
 from .eval_case import get_all_tool_calls
 from .eval_case import IntermediateDataType
 from .eval_case import Invocation
+from .eval_config import EvalConfig
+from .eval_config import get_eval_metrics_from_config
+from .eval_config import get_evaluation_criteria_or_default
+from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
 from .eval_metrics import PrebuiltMetrics
@@ -72,12 +76,6 @@
 EXPECTED_TOOL_USE_COLUMN = "expected_tool_use"


-DEFAULT_CRITERIA = {
-    TOOL_TRAJECTORY_SCORE_KEY: 1.0,  # 1-point scale; 1.0 is perfect.
-    RESPONSE_MATCH_SCORE_KEY: 0.8,  # Rouge-1 text match; 0.8 is default.
-}
-
-
 def load_json(file_path: str) -> Union[Dict, List]:
   with open(file_path, "r") as f:
     return json.load(f)
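
Note: the deleted DEFAULT_CRITERIA defaults are not lost; they move behind
get_evaluation_criteria_or_default in eval_config.py. A minimal sketch of the
equivalent config under the new model, using only constructors this diff
itself exercises (EvalConfig(criteria=...) and BaseCriterion(threshold=...));
the authoritative defaults live in eval_config.py:

    # Sketch only: the two defaults the removed dict encoded, as an EvalConfig.
    # TOOL_TRAJECTORY_SCORE_KEY and RESPONSE_MATCH_SCORE_KEY are the
    # metric-name constants this module already defines.
    _default_config = EvalConfig(
        criteria={
            TOOL_TRAJECTORY_SCORE_KEY: BaseCriterion(threshold=1.0),  # perfect trajectory
            RESPONSE_MATCH_SCORE_KEY: BaseCriterion(threshold=0.8),  # Rouge-1 match
        }
    )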
@@ -99,28 +97,18 @@ class AgentEvaluator:
   """An evaluator for Agents, mainly intended for helping with test cases."""

   @staticmethod
-  def find_config_for_test_file(test_file: str):
+  def find_config_for_test_file(test_file: str) -> EvalConfig:
     """Find the test_config.json file in the same folder as the test file."""
     test_folder = os.path.dirname(test_file)
     config_path = os.path.join(test_folder, "test_config.json")
-    if os.path.exists(config_path):
-      config_data = load_json(config_path)
-      if "criteria" in config_data and isinstance(
-          config_data["criteria"], dict
-      ):
-        return config_data["criteria"]
-      else:
-        raise ValueError(
-            f"Invalid format for test_config.json at {config_path}. Expected a"
-            " 'criteria' dictionary."
-        )
-    return DEFAULT_CRITERIA
+    return get_evaluation_criteria_or_default(config_path)

   @staticmethod
   async def evaluate_eval_set(
       agent_module: str,
       eval_set: EvalSet,
-      criteria: dict[str, float],
+      criteria: Optional[dict[str, float]] = None,
+      eval_config: Optional[EvalConfig] = None,
       num_runs: int = NUM_RUNS,
       agent_name: Optional[str] = None,
       print_detailed_results: bool = True,
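
Note: find_config_for_test_file now returns a full EvalConfig rather than a
bare criteria dict, delegating parsing and defaulting to
get_evaluation_criteria_or_default. A hypothetical caller's view (the
test-file path is made up; the real test_config.json schema is owned by
eval_config.py):

    # Looks for test_config.json next to the test file; falls back to the
    # package defaults when the file is absent.
    config = AgentEvaluator.find_config_for_test_file(
        "tests/fixture/my_agent/simple.test.json"  # hypothetical path
    )
    assert isinstance(config, EvalConfig)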
@@ -133,20 +121,33 @@ async def evaluate_eval_set(
         look for 'root_agent' in the loaded module.
       eval_set: The eval set.
       criteria: Evaluation criteria, a dictionary of metric names to their
-        respective thresholds.
+        respective thresholds. This field is deprecated.
+      eval_config: The evaluation config.
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent, if trying to evaluate something other
         than root agent. If left empty or none, then root agent is evaluated.
       print_detailed_results: Whether to print detailed results for each metric
         evaluation.
     """
+    if criteria:
+      logger.warning(
+          "`criteria` field is deprecated and will be removed in future"
+          " iterations. For now, we will automatically map values in `criteria`"
+          " to `eval_config`, but you should move to the `eval_config` field."
+      )
+      base_criteria = {
+          k: BaseCriterion(threshold=v) for k, v in criteria.items()
+      }
+      eval_config = EvalConfig(criteria=base_criteria)
+
+    if eval_config is None:
+      raise ValueError("`eval_config` is required.")
+
     agent_for_eval = AgentEvaluator._get_agent_for_eval(
         module_name=agent_module, agent_name=agent_name
     )
-    eval_metrics = [
-        EvalMetric(metric_name=n, threshold=t) for n, t in criteria.items()
-    ]
+    eval_metrics = get_eval_metrics_from_config(eval_config)

     # Step 1: Perform evals, basically inferencing and evaluation of metrics
     eval_results_by_eval_id = await AgentEvaluator._get_eval_results_by_eval_id(
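
Note: for callers, the migration reads roughly as below (a sketch;
my_eval_set, the module path, and the "response_match_score" key are
illustrative). The deprecated form keeps working for now: it logs the warning
above and is mapped to an EvalConfig internally.

    # Deprecated: flat metric-name -> threshold dict (warns, then auto-mapped).
    await AgentEvaluator.evaluate_eval_set(
        agent_module="my_agents.hello_world",  # hypothetical module
        eval_set=my_eval_set,
        criteria={"response_match_score": 0.8},
    )

    # Preferred: an explicit EvalConfig with per-metric criterion objects.
    await AgentEvaluator.evaluate_eval_set(
        agent_module="my_agents.hello_world",
        eval_set=my_eval_set,
        eval_config=EvalConfig(
            criteria={"response_match_score": BaseCriterion(threshold=0.8)}
        ),
    )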
@@ -226,15 +227,15 @@ async def evaluate(
     initial_session = AgentEvaluator._get_initial_session(initial_session_file)

     for test_file in test_files:
-      criteria = AgentEvaluator.find_config_for_test_file(test_file)
+      eval_config = AgentEvaluator.find_config_for_test_file(test_file)
       eval_set = AgentEvaluator._load_eval_set_from_file(
-          test_file, criteria, initial_session
+          test_file, eval_config, initial_session
       )

       await AgentEvaluator.evaluate_eval_set(
           agent_module=agent_module,
           eval_set=eval_set,
-          criteria=criteria,
+          eval_config=eval_config,
           num_runs=num_runs,
           agent_name=agent_name,
           print_detailed_results=print_detailed_results,
@@ -252,11 +253,11 @@ def migrate_eval_data_to_new_schema(
           "One of old_eval_data_file or new_eval_data_file is empty."
       )

-    criteria = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
+    eval_config = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
     initial_session = AgentEvaluator._get_initial_session(initial_session_file)

     eval_set = AgentEvaluator._get_eval_set_from_old_format(
-        old_eval_data_file, criteria, initial_session
+        old_eval_data_file, eval_config, initial_session
     )

     with open(new_eval_data_file, "w") as f:
@@ -265,7 +266,7 @@ def migrate_eval_data_to_new_schema(
   @staticmethod
   def _load_eval_set_from_file(
       eval_set_file: str,
-      criteria: dict[str, float],
+      eval_config: EvalConfig,
       initial_session: dict[str, Any],
   ) -> EvalSet:
     """Loads an EvalSet from the given file."""
@@ -292,17 +293,17 @@ def _load_eval_set_from_file(

     # If we are here, the data must be specified in the older format.
     return AgentEvaluator._get_eval_set_from_old_format(
-        eval_set_file, criteria, initial_session
+        eval_set_file, eval_config, initial_session
     )

   @staticmethod
   def _get_eval_set_from_old_format(
       eval_set_file: str,
-      criteria: dict[str, float],
+      eval_config: EvalConfig,
       initial_session: dict[str, Any],
   ) -> EvalSet:
     data = AgentEvaluator._load_dataset(eval_set_file)[0]
-    AgentEvaluator._validate_input([data], criteria)
+    AgentEvaluator._validate_input([data], eval_config.criteria)
     eval_data = {
         "name": eval_set_file,
         "data": data,