@@ -47,9 +47,11 @@ async def flow_side_effect(timeout, **kwargs):
     if invalid_calls > 0:
         # Return an out-of-range score to trigger an exception in the evaluator's check_score_is_valid
         return {
-            "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
-            "tool_calls_success_level": 25,
-            "additional_details": {},
+            "llm_output": {
+                "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
+                "tool_calls_success_level": 25,
+                "details": {},
+            }
         }

     score = 1  # Default score for "all bad"
@@ -60,12 +62,14 @@ async def flow_side_effect(timeout, **kwargs):
         score = 3  # Mixed good and bad

     return {
-        "chain_of_thought": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
-        "tool_calls_success_level": score,
-        "additional_details": {
-            "tool_calls_made_by_agent": total_calls,
-            "correct_tool_calls_made_by_agent": total_good_calls,
-        },
+        "llm_output": {
+            "chain_of_thought": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
+            "tool_calls_success_level": score,
+            "details": {
+                "tool_calls_made_by_agent": total_calls,
+                "correct_tool_calls_made_by_agent": total_good_calls,
+            },
+        }
     }


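For context, a minimal sketch of the unwrapping these tests appear to rely on: the mocked flow now nests its scoring fields under "llm_output", and the evaluator is expected to flatten them into key-prefixed result entries. The helper name and parsing steps below are assumptions for illustration, not the evaluator's actual code; only the "llm_output", "chain_of_thought", "tool_calls_success_level", and "details" keys come from the patch itself.

def parse_flow_output(flow_result: dict, key: str = "tool_call_accuracy") -> dict:
    # Hypothetical flattening of the nested mock payload into prefixed result keys.
    llm_output = flow_result["llm_output"]
    return {
        key: llm_output["tool_calls_success_level"],
        f"{key}_reason": llm_output["chain_of_thought"],
        f"{key}_details": llm_output["details"],
    }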
@@ -132,7 +136,7 @@ def test_evaluate_tools_valid1(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
-        assert "details" in result
+        assert f"{key}_details" in result

     def test_evaluate_tools_valid2(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -194,7 +198,7 @@ def test_evaluate_tools_valid2(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
-        assert "details" in result
+        assert f"{key}_details" in result

     def test_evaluate_tools_valid3(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -256,7 +260,7 @@ def test_evaluate_tools_valid3(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
-        assert "details" in result
+        assert f"{key}_details" in result

     def test_evaluate_tools_one_eval_fails(self, mock_model_config):
         with pytest.raises(EvaluationException) as exc_info:
@@ -351,7 +355,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
-        assert result["details"] == {}
+        assert result[f"{key}_details"] == {}

     def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -391,7 +395,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
-        assert result["details"] == {}
+        assert result[f"{key}_details"] == {}

     def test_evaluate_tools_no_tools(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -424,7 +428,7 @@ def test_evaluate_tools_no_tools(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
-        assert result["details"] == {}
+        assert result[f"{key}_details"] == {}

     def test_evaluate_bing_custom_search(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
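Taken together, the assertion changes switch the result contract from a bare "details" entry to a per-evaluator prefixed f"{key}_details" entry. A small sanity check of that convention, reusing the hypothetical parse_flow_output sketch above (the key name "tool_call_accuracy" is assumed from context, not taken from the patch):

key = "tool_call_accuracy"  # assumed evaluator key, for illustration only
result = parse_flow_output(
    {
        "llm_output": {
            "chain_of_thought": "Evaluated 2 tool calls with 1 correct calls.",
            "tool_calls_success_level": 4,
            "details": {"tool_calls_made_by_agent": 2, "correct_tool_calls_made_by_agent": 1},
        }
    },
    key=key,
)
assert f"{key}_details" in result  # was: assert "details" in result
assert result[f"{key}_details"]["tool_calls_made_by_agent"] == 2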