@@ -67,9 +67,9 @@ def test_tool_description_has_grounding_instructions(self):
6767 ]
6868
6969 for keyword in grounding_keywords :
70- assert (
71- keyword in tool_description
72- ), f"Tool description missing keyword: { keyword } "
70+ assert keyword in tool_description , (
71+ f"Tool description missing keyword: { keyword } "
72+ )
7373 print (f"✓ Found: { keyword } " )
7474
7575 print (
@@ -107,9 +107,9 @@ async def test_judge_evaluation_of_tool_created_memories(self):
107107 print (f"Scores: { evaluation } " )
108108
109109 # Well-grounded tool memory should score well
110- assert (
111- evaluation [" overall_score" ] >= 0.7
112- ), f"Well-grounded tool memory should score high: { evaluation [ 'overall_score' ] } "
110+ assert evaluation [ "overall_score" ] >= 0.7 , (
111+ f"Well-grounded tool memory should score high: { evaluation [' overall_score' ] } "
112+ )
113113
114114 # Test case: Poorly grounded tool memory
115115 poor_grounded_memory = "He has extensive backend experience. She specializes in React. They collaborate effectively."
@@ -133,9 +133,9 @@ async def test_judge_evaluation_of_tool_created_memories(self):
133133
134134 # Both should at least be evaluated successfully
135135 assert evaluation ["overall_score" ] >= 0.7 , "Good grounding should score well"
136- assert (
137- poor_evaluation [ "overall_score" ] >= 0.0
138- ), "Poor grounding should still be evaluated"
136+ assert poor_evaluation [ "overall_score" ] >= 0.0 , (
137+ "Poor grounding should still be evaluated"
138+ )
139139
140140 @pytest .mark .requires_api_keys
141141 async def test_realistic_tool_usage_scenario (self ):
@@ -194,12 +194,12 @@ async def test_realistic_tool_usage_scenario(self):
194194 print (f"Evaluation: { evaluation } " )
195195
196196 # Should demonstrate good contextual grounding
197- assert (
198- evaluation [ "pronoun_resolution_score" ] >= 0.8
199- ), "Should properly ground 'she' to 'Maria'"
200- assert (
201- evaluation [" overall_score" ] >= 0.6
202- ), f"Realistic tool usage should show good grounding: { evaluation [ 'overall_score' ] } "
197+ assert evaluation [ "pronoun_resolution_score" ] >= 0.8 , (
198+ "Should properly ground 'she' to 'Maria'"
199+ )
200+ assert evaluation [ "overall_score" ] >= 0.6 , (
201+ f"Realistic tool usage should show good grounding: { evaluation [' overall_score' ] } "
202+ )
203203
204204 print (
205205 "✓ Tool-based memory creation with proper contextual grounding successful"
0 commit comments